diff --git a/Cargo.lock b/Cargo.lock index fbe5386..73cc1f9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -900,9 +900,9 @@ dependencies = [ [[package]] name = "noodles" -version = "0.29.0" +version = "0.34.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30fd5cce0e234cf9e5c59208c06a1737db5399f526c1f58c761ea453b57cc91b" +checksum = "47a338361a01ec3ba68f4ebb5a1594444ba126478798d024d6fe8de35db108a5" dependencies = [ "noodles-bam", "noodles-bcf", @@ -920,9 +920,9 @@ dependencies = [ [[package]] name = "noodles-bam" -version = "0.24.0" +version = "0.28.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f52d708c3466b33cbf4a4e98df9fb8b1a704709c841c41c4655701f92c4c05bb" +checksum = "5b100df8df462cf3197ce770c241614e68799ee3b557f85bd11779225bd27896" dependencies = [ "bit-vec", "byteorder", @@ -938,9 +938,9 @@ dependencies = [ [[package]] name = "noodles-bcf" -version = "0.18.0" +version = "0.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "795dba2633a818b4851f7fb93c95714009ebeda556ef4c09bd145218beaea463" +checksum = "f1984f2b1c676c2652ff03e9002f50774be35fbbcd29ffee337f1e372db047a2" dependencies = [ "byteorder", "futures", @@ -954,16 +954,15 @@ dependencies = [ [[package]] name = "noodles-bgzf" -version = "0.17.0" +version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "796758951544bea19afd85e236ca59c7cdef588b6988dede37c7c7a1a5146211" +checksum = "2bff155a5362c61d9b977648d874aa84aea88f7de032e43df908c7cd26b22b6d" dependencies = [ "byteorder", "bytes", "crossbeam-channel", "flate2", "futures", - "num_cpus", "pin-project-lite", "tokio", "tokio-util", @@ -971,15 +970,15 @@ dependencies = [ [[package]] name = "noodles-core" -version = "0.9.0" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a535f23a76e1d159401d70d5766ab116154d6774961e00559978bf7615f72fe4" +checksum = "72f9ab09e13392e71797e7502109575d2aae5cb2002bd2304647f7746215c2fe" [[package]] name = "noodles-cram" -version = "0.21.0" +version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e068befeb207aa67519597315eae8504a80e4eac2132b3409768807f94c9de4" +checksum = "2259b516050b54a980bff6f6dd0fc4690699e5fda3f792db603222e81e1f60b6" dependencies = [ "async-compression", "bitflags", @@ -1000,9 +999,9 @@ dependencies = [ [[package]] name = "noodles-csi" -version = "0.11.0" +version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2c6494ee69043b2a7623f060bd6b7abe52f48ee10d641cf1aaf78ac861bd46b" +checksum = "56201d5278cb875d3b4a8e338a21fa5b43f6dd56547447006dbad4638e6c952e" dependencies = [ "bit-vec", "byteorder", @@ -1013,9 +1012,9 @@ dependencies = [ [[package]] name = "noodles-fasta" -version = "0.16.0" +version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2730b9e2c2e8422b3b62e4ca91bc784f1210ba35d67a20cb81d33ffc0e8c32c7" +checksum = "0cbcf02af0f440981cef550db3e078b588591e68abd791ab0767c599945b4b93" dependencies = [ "bytes", "memchr", @@ -1026,9 +1025,9 @@ dependencies = [ [[package]] name = "noodles-fastq" -version = "0.5.1" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e695895b63ac224f24f860f185d50638093d142e911b974c8876f7c28cabac76" +checksum = "27c7d065ab7c0814ea9df05624a1be1410d2039b091213ac1cb5281866c3fdfb" dependencies = [ "futures", "tokio", @@ -1036,9 +1035,9 @@ dependencies = [ [[package]] name = "noodles-gff" -version = "0.9.0" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8990b5fdbdccabfd5d8dbacb52087bdf42b2947b13d38a200e0887f3c7688c7" +checksum = "d6befb1186be031f48baffa083d63c39246210e5dfac3a1f8a2305f272efe12b" dependencies = [ "noodles-core", "percent-encoding", @@ -1046,9 +1045,9 @@ dependencies = [ [[package]] name = "noodles-sam" -version = "0.21.0" +version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ceb3845da63c34509424b470a5c892e1bdb9a9b5aeea826431abef41231748c" +checksum = "ca67089a8f2bedf0bfcce06758cde48fe8fc04d2a33b02aa445cdc05c798051f" dependencies = [ "bitflags", "futures", @@ -1059,15 +1058,14 @@ dependencies = [ "noodles-core", "noodles-csi", "noodles-fasta", - "rustc-hash", "tokio", ] [[package]] name = "noodles-tabix" -version = "0.14.0" +version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f25f923af05f97cb5ffc2b554bcbec6eb15ad7d199c3b89212129e364b5faac0" +checksum = "76b4c6ad670431f0d3981080621f6e64037280916bf1638c373a1b941449e4ba" dependencies = [ "bit-vec", "byteorder", @@ -1080,9 +1078,9 @@ dependencies = [ [[package]] name = "noodles-vcf" -version = "0.21.0" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ea8fd6877c229c7aff9d69621e58347120eec76ca35a2e8560df52b1cba3b44" +checksum = "fa196d6d6ad22277390e2cc588d6999a00fd73bd1ca4a5f68e8fda710c8e8364" dependencies = [ "futures", "indexmap", @@ -1372,12 +1370,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "rustc-hash" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" - [[package]] name = "rustversion" version = "1.0.9" diff --git a/Cargo.toml b/Cargo.toml index 92e6963..7808a7f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,7 @@ git-testament = "0.2.1" indexmap = "1.9.1" indicatif = "0.16.2" itertools = "0.10.5" -noodles = { version = "0.29.0", features = [ +noodles = { version = "0.34.0", features = [ "async", "bam", "bgzf", @@ -40,7 +40,7 @@ regex = "1.5.5" rust-lapper = "1.0.1" serde = { version = "1.0.137", features = ["derive"] } serde_json = { version = "1.0.81", features = ["preserve_order"] } -tokio = { version = "1.18.0", features = ["fs", "rt-multi-thread"] } +tokio = { version = "1.18.0", features = ["fs", "io-std", "rt-multi-thread"] } tracing = "0.1.34" tracing-subscriber = "0.3.11" diff --git a/src/convert/bam.rs b/src/convert/bam.rs index 35c3354..93003a9 100644 --- a/src/convert/bam.rs +++ b/src/convert/bam.rs @@ -1,6 +1,7 @@ //! Conversions from a BAM file to other next-generation sequencing file formats. use std::io; +use std::num::NonZeroUsize; use std::path::PathBuf; use anyhow::Context; @@ -41,7 +42,7 @@ pub async fn to_sam_async( let mut record = Record::default(); // (4) Write each record in the BAM file to the SAM file. - while reader.read_record(&mut record).await? != 0 { + while reader.read_record(&header.parsed, &mut record).await? != 0 { writer .write_alignment_record(&header.parsed, &record) .await?; @@ -83,8 +84,8 @@ pub async fn to_cram_async( let name = name_as_string.parse()?; let length = record.sequence().len(); - let reference_sequence = Map::::new(name, length)?; - reference_sequences.insert(name_as_string, reference_sequence); + let reference_sequence = Map::::new(NonZeroUsize::try_from(length)?); + reference_sequences.insert(name, reference_sequence); } let repository = fasta::Repository::new(records); @@ -106,7 +107,7 @@ pub async fn to_cram_async( // (6) Write each record in the BAM file to the CRAM file. info!("Writing records to CRAM file."); - while reader.read_record(&mut record).await? != 0 { + while reader.read_record(&header.parsed, &mut record).await? != 0 { let cram_record = cram::Record::try_from_alignment_record(&header.parsed, &record)?; writer .write_record(&header.parsed, cram_record) diff --git a/src/convert/sam.rs b/src/convert/sam.rs index 8090399..4275576 100644 --- a/src/convert/sam.rs +++ b/src/convert/sam.rs @@ -1,6 +1,7 @@ //! Conversions from a SAM file to other next-generation sequencing file formats. use std::io; +use std::num::NonZeroUsize; use std::path::PathBuf; use anyhow::Context; @@ -101,8 +102,8 @@ pub async fn to_cram_async( let name = name_as_string.parse()?; let length = record.sequence().len(); - let reference_sequence = Map::::new(name, length)?; - reference_sequences.insert(name_as_string, reference_sequence); + let reference_sequence = Map::::new(NonZeroUsize::try_from(length)?); + reference_sequences.insert(name, reference_sequence); } let repository = fasta::Repository::new(records); diff --git a/src/derive/command/instrument.rs b/src/derive/command/instrument.rs index b8bcee8..c36ec19 100644 --- a/src/derive/command/instrument.rs +++ b/src/derive/command/instrument.rs @@ -54,8 +54,9 @@ async fn app(src: PathBuf, first_n_reads: Option) -> anyhow::Result<()> { let mut instrument_names = HashSet::new(); let mut flowcell_names = HashSet::new(); - let ParsedBAMFile { mut reader, .. } = - crate::utils::formats::bam::open_and_parse(src, IndexCheck::Full)?; + let ParsedBAMFile { + mut reader, header, .. + } = crate::utils::formats::bam::open_and_parse(src, IndexCheck::Full)?; // (1) Collect instrument names and flowcell names from reads within the // file. Support for sampling only a portion of the reads is provided. @@ -66,7 +67,7 @@ async fn app(src: PathBuf, first_n_reads: Option) -> anyhow::Result<()> { sample_max = s; } - for result in reader.records() { + for result in reader.records(&header.parsed) { let record = result?; if let Some(read_name) = record.read_name() { diff --git a/src/index/bam.rs b/src/index/bam.rs index 7cf0760..d9e3b4a 100644 --- a/src/index/bam.rs +++ b/src/index/bam.rs @@ -72,7 +72,7 @@ pub fn index(src: PathBuf) -> anyhow::Result<()> { let mut counter = RecordCounter::new(); loop { - match reader.read_record(&mut record) { + match reader.read_record(&header.parsed, &mut record) { Ok(0) => break, Ok(_) => {} Err(e) => bail!("failed to read record: {}", e), diff --git a/src/qc.rs b/src/qc.rs index d6e1ab2..1d5a0e2 100644 --- a/src/qc.rs +++ b/src/qc.rs @@ -7,6 +7,7 @@ use itertools::Itertools; use noodles::sam; use noodles::sam::header::record::value::map::ReferenceSequence; use noodles::sam::header::record::value::Map; +use noodles::sam::record::ReferenceSequenceName; use noodles::sam::Header; use sam::alignment::Record; @@ -200,14 +201,27 @@ pub trait SequenceBasedQualityControlFacet { fn supports_sequence_name(&self, name: &str) -> bool; /// Sets up a quality control facet for a given sequence. - fn setup(&mut self, sequence: &Map) -> anyhow::Result<()>; + fn setup( + &mut self, + name: &ReferenceSequenceName, + sequence: &Map, + ) -> anyhow::Result<()>; /// Processes a sequence for a quality control facet. - fn process(&mut self, seq: &Map, record: &Record) -> anyhow::Result<()>; + fn process( + &mut self, + name: &ReferenceSequenceName, + sequence: &Map, + record: &Record, + ) -> anyhow::Result<()>; /// Tears down any machinery that was built up for this sequence within the /// quality control facet. - fn teardown(&mut self, sequence: &Map) -> anyhow::Result<()>; + fn teardown( + &mut self, + name: &ReferenceSequenceName, + sequence: &Map, + ) -> anyhow::Result<()>; /// Adds the results of this quality control facet to the global /// [`results::Results`] object for writing to a file. diff --git a/src/qc/command.rs b/src/qc/command.rs index 99edbcf..64a3623 100644 --- a/src/qc/command.rs +++ b/src/qc/command.rs @@ -261,7 +261,7 @@ fn app( if !supported_sequences .iter() .map(|s| s.name()) - .any(|x| x == sequence) + .any(|x| x == *sequence) { bail!( "Sequence \"{}\" not found in specified reference genome. \ @@ -302,7 +302,7 @@ fn app( info!("Starting first pass for QC stats."); let mut counter = RecordCounter::new(); - for result in reader.records() { + for result in reader.records(&header.parsed) { let record = result?; for facet in &mut record_facets { @@ -361,14 +361,14 @@ fn app( debug!(" [*] Setting up sequence."); for facet in &mut sequence_facets { if facet.supports_sequence_name(name) { - facet.setup(seq)?; + facet.setup(name, seq)?; } } let query = reader.query( - header.parsed.reference_sequences(), + &header.parsed, &index, - &Region::new(name, start..=end), + &Region::new(name.to_string(), start..=end), )?; debug!(" [*] Processing records from sequence."); @@ -376,7 +376,7 @@ fn app( let record = result?; for facet in &mut sequence_facets { if facet.supports_sequence_name(name) { - facet.process(seq, &record)?; + facet.process(name, seq, &record)?; } } @@ -390,7 +390,7 @@ fn app( debug!(" [*] Tearing down sequence."); for facet in &mut sequence_facets { if facet.supports_sequence_name(name) { - facet.teardown(seq)?; + facet.teardown(name, seq)?; } } } diff --git a/src/qc/record_based/features.rs b/src/qc/record_based/features.rs index ae2f7a9..9b92cc4 100644 --- a/src/qc/record_based/features.rs +++ b/src/qc/record_based/features.rs @@ -7,6 +7,7 @@ use std::rc::Rc; use anyhow::bail; use anyhow::Context; use noodles::sam; +use noodles::sam::record::ReferenceSequenceName; use rust_lapper::Interval; use rust_lapper::Lapper; use sam::alignment::Record; @@ -80,10 +81,11 @@ impl FeatureNames { /// Main struct for the Features quality control facet. pub struct GenomicFeaturesFacet<'a> { /// Store of the cached exonic translation regions. - pub exonic_translation_regions: HashMap>, + pub exonic_translation_regions: + HashMap>, /// Store of the cached gene regions. - pub gene_regions: HashMap>, + pub gene_regions: HashMap>, /// Feature names that correspond to the respective features in the gene /// model. These are passed in on the command line. @@ -142,7 +144,7 @@ impl<'a> RecordBasedQualityControlFacet for GenomicFeaturesFacet<'a> { .header .reference_sequences() .get_index(id) - .map(|(_, rs)| Some(rs.name().as_str())) + .map(|(name, _)| Some(name)) { Some(Some(name)) => name, _ => { @@ -156,7 +158,7 @@ impl<'a> RecordBasedQualityControlFacet for GenomicFeaturesFacet<'a> { if !self .primary_chromosome_names .iter() - .any(|s: &String| s == seq_name) + .any(|s: &String| s.parse::().unwrap() == *seq_name) { self.metrics.records.ignored_nonprimary_chromosome += 1; return Ok(()); @@ -333,9 +335,9 @@ impl<'a> GenomicFeaturesFacet<'a> { utr_features.len() ); - exonic_translations.insert(parent_seq_name.to_string(), Lapper::new(utr_features)); + exonic_translations.insert(parent_seq_name.parse().unwrap(), Lapper::new(utr_features)); gene_regions.insert( - parent_seq_name.to_string(), + parent_seq_name.parse().unwrap(), Lapper::new(gene_region_features), ); } diff --git a/src/qc/sequence_based/coverage.rs b/src/qc/sequence_based/coverage.rs index 6f8a43b..33d6756 100644 --- a/src/qc/sequence_based/coverage.rs +++ b/src/qc/sequence_based/coverage.rs @@ -7,6 +7,7 @@ use std::rc::Rc; use noodles::sam::alignment::Record; use noodles::sam::header::record::value::map::Map; use noodles::sam::header::record::value::map::ReferenceSequence; +use noodles::sam::record::ReferenceSequenceName; use serde::Deserialize; use serde::Serialize; use tracing::error; @@ -136,15 +137,24 @@ impl SequenceBasedQualityControlFacet for CoverageFacet { .any(|x| x == name) } - fn setup(&mut self, _: &Map) -> anyhow::Result<()> { + fn setup( + &mut self, + _: &ReferenceSequenceName, + _: &Map, + ) -> anyhow::Result<()> { Ok(()) } - fn process(&mut self, seq: &Map, record: &Record) -> anyhow::Result<()> { + fn process( + &mut self, + name: &ReferenceSequenceName, + sequence: &Map, + record: &Record, + ) -> anyhow::Result<()> { let h = self .coverage_per_position - .entry(seq.name().to_string()) - .or_insert_with(|| Histogram::zero_based_with_capacity(usize::from(seq.length()))); + .entry(name.to_string()) + .or_insert_with(|| Histogram::zero_based_with_capacity(usize::from(sequence.length()))); let record_start = usize::from(record.alignment_start().unwrap()); let record_end = usize::from(record.alignment_end().unwrap()); @@ -169,8 +179,12 @@ impl SequenceBasedQualityControlFacet for CoverageFacet { Ok(()) } - fn teardown(&mut self, sequence: &Map) -> anyhow::Result<()> { - let positions = match self.coverage_per_position.get(sequence.name().as_str()) { + fn teardown( + &mut self, + name: &ReferenceSequenceName, + _: &Map, + ) -> anyhow::Result<()> { + let positions = match self.coverage_per_position.get(&name.to_string()) { Some(s) => s, // In the None case, no records were inserted for this sequence. // This may be because the file is a mini-SAM/BAM/CRAM. If that's @@ -186,7 +200,7 @@ impl SequenceBasedQualityControlFacet for CoverageFacet { let coverage_per_bin_vec = self .metrics .mean_coverage_per_bin - .entry(sequence.name().to_string()) + .entry(name.to_string()) .or_default(); for i in positions.range_start()..=positions.range_stop() { @@ -220,7 +234,7 @@ impl SequenceBasedQualityControlFacet for CoverageFacet { let median_over_mean = median / mean; // Removed to save memory. - self.coverage_per_position.remove(sequence.name().as_str()); + self.coverage_per_position.remove(&name.to_string()); // Adds the current sequence's coverage histogram to our global coverage // histogram. @@ -232,19 +246,17 @@ impl SequenceBasedQualityControlFacet for CoverageFacet { } // Saved for reporting. - self.metrics - .mean_coverage - .insert(sequence.name().to_string(), mean); + self.metrics.mean_coverage.insert(name.to_string(), mean); self.metrics .median_coverage - .insert(sequence.name().to_string(), median); + .insert(name.to_string(), median); self.metrics .median_over_mean_coverage - .insert(sequence.name().to_string(), median_over_mean); + .insert(name.to_string(), median_over_mean); self.metrics .ignored .pileup_too_large_positions - .insert(sequence.name().to_string(), ignored); + .insert(name.to_string(), ignored); Ok(()) } diff --git a/src/qc/sequence_based/edits.rs b/src/qc/sequence_based/edits.rs index d1846c7..05eabb5 100644 --- a/src/qc/sequence_based/edits.rs +++ b/src/qc/sequence_based/edits.rs @@ -13,6 +13,7 @@ use noodles::sam::header::record::value::{map::ReferenceSequence, Map}; use noodles::sam::record::cigar::op::Kind; use noodles::sam::record::sequence::base::TryFromCharError; use noodles::sam::record::sequence::Base; +use noodles::sam::record::ReferenceSequenceName; use serde::Deserialize; use serde::Serialize; @@ -173,8 +174,12 @@ impl SequenceBasedQualityControlFacet for EditsFacet { true } - fn setup(&mut self, sequence: &Map) -> anyhow::Result<()> { - let seq_name = sequence.name().as_str(); + fn setup( + &mut self, + name: &ReferenceSequenceName, + sequence: &Map, + ) -> anyhow::Result<()> { + let seq_name = name.to_string(); // (1) Load the reference sequence we're currently evaluating into memory. let mut fasta = @@ -209,7 +214,12 @@ impl SequenceBasedQualityControlFacet for EditsFacet { Ok(()) } - fn process<'b>(&mut self, _: &Map, record: &Record) -> anyhow::Result<()> { + fn process<'b>( + &mut self, + _: &ReferenceSequenceName, + _: &Map, + record: &Record, + ) -> anyhow::Result<()> { // (1) First, if the read is unmapped, we need to ignore it for this // analysis because there is no reference to compare it to. Further, we don't // care about duplicate reads for this analysis (and they will only serve to @@ -292,8 +302,12 @@ impl SequenceBasedQualityControlFacet for EditsFacet { Ok(()) } - fn teardown(&mut self, seq: &Map) -> anyhow::Result<()> { - let seq_name = seq.name().to_string(); + fn teardown( + &mut self, + name: &ReferenceSequenceName, + _: &Map, + ) -> anyhow::Result<()> { + let seq_name = name.to_string(); // (1) Resets the current sequence to None so that there can be no // confusion as to whether the sequence we were looking at is still diff --git a/src/utils/formats/bam.rs b/src/utils/formats/bam.rs index bd3a627..ad9d260 100644 --- a/src/utils/formats/bam.rs +++ b/src/utils/formats/bam.rs @@ -13,6 +13,7 @@ use noodles::bam::bai; use noodles::bgzf; use noodles::sam::header::record::value::map::Map; use noodles::sam::header::record::value::map::ReferenceSequence; +use noodles::sam::record::ReferenceSequenceName; use tracing::debug; use crate::utils::formats::utils::IndexCheck; @@ -64,7 +65,7 @@ pub struct ParsedBAMFile { pub header: RawAndParsedHeaders, /// The reference sequences read from the BAM file. - pub reference_sequences: IndexMap>, + pub reference_sequences: IndexMap>, /// The path to the associated BAM index file. pub index_path: PathBuf, @@ -167,7 +168,7 @@ pub struct ParsedAsyncBAMFile { pub header: RawAndParsedHeaders, /// The reference sequences read from the BAM file. - pub reference_sequences: IndexMap>, + pub reference_sequences: IndexMap>, /// The path to the associated BAM index file. pub index_path: PathBuf, diff --git a/src/view/bam.rs b/src/view/bam.rs index 557d94a..29f63ae 100644 --- a/src/view/bam.rs +++ b/src/view/bam.rs @@ -47,7 +47,7 @@ pub fn view(src: PathBuf, query: Option, mode: Mode) -> anyhow::Result<( let region = query.parse().with_context(|| "parsing query")?; let records = reader - .query(header.parsed.reference_sequences(), &index, ®ion) + .query(&header.parsed, &index, ®ion) .with_context(|| "querying BAM file")?; for result in records { @@ -56,7 +56,7 @@ pub fn view(src: PathBuf, query: Option, mode: Mode) -> anyhow::Result<( } } else { // (b) Else, print all of the records in the file. - for result in reader.records() { + for result in reader.records(&header.parsed) { let record = result?; writer.write_alignment_record(&header.parsed, &record)?; }