diff --git a/noodles-bam/Cargo.toml b/noodles-bam/Cargo.toml index 06b8204da..b134bcd7e 100644 --- a/noodles-bam/Cargo.toml +++ b/noodles-bam/Cargo.toml @@ -18,6 +18,7 @@ bit-vec.workspace = true byteorder.workspace = true bytes.workspace = true futures = { workspace = true, optional = true, features = ["std"] } +indexmap.workspace = true tokio = { workspace = true, optional = true, features = ["fs", "io-util"] } noodles-bgzf = { path = "../noodles-bgzf", version = "0.25.0" } diff --git a/noodles-bam/src/bai/async/reader.rs b/noodles-bam/src/bai/async/reader.rs index aed6f2eaf..194a6e6fc 100644 --- a/noodles-bam/src/bai/async/reader.rs +++ b/noodles-bam/src/bai/async/reader.rs @@ -1,5 +1,4 @@ -use std::collections::HashMap; - +use indexmap::IndexMap; use noodles_bgzf as bgzf; use noodles_csi::{ index::{ @@ -144,7 +143,7 @@ where Ok(ReferenceSequence::new(bins, intervals, metadata)) } -async fn read_bins(reader: &mut R) -> io::Result<(HashMap, Option)> +async fn read_bins(reader: &mut R) -> io::Result<(IndexMap, Option)> where R: AsyncRead + Unpin, { @@ -152,7 +151,7 @@ where const METADATA_ID: usize = Bin::metadata_id(DEPTH); - fn duplicate_bin_error(id: usize) -> io::Result<(HashMap, Option)> { + fn duplicate_bin_error(id: usize) -> io::Result<(IndexMap, Option)> { Err(io::Error::new( io::ErrorKind::InvalidData, format!("duplicate bin ID: {id}"), @@ -163,7 +162,7 @@ where usize::try_from(n).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e)) })?; - let mut bins = HashMap::with_capacity(n_bin); + let mut bins = IndexMap::with_capacity(n_bin); let mut metadata = None; for _ in 0..n_bin { diff --git a/noodles-bam/src/bai/async/writer.rs b/noodles-bam/src/bai/async/writer.rs index 53c731866..64c6c96c2 100644 --- a/noodles-bam/src/bai/async/writer.rs +++ b/noodles-bam/src/bai/async/writer.rs @@ -1,5 +1,4 @@ -use std::collections::HashMap; - +use indexmap::IndexMap; use noodles_bgzf as bgzf; use noodles_csi::index::{ reference_sequence::{bin::Chunk, Bin, Metadata}, @@ -163,7 +162,7 @@ where async fn write_bins( writer: &mut W, - bins: &HashMap, + bins: &IndexMap, metadata: Option<&Metadata>, ) -> io::Result<()> where diff --git a/noodles-bam/src/bai/reader.rs b/noodles-bam/src/bai/reader.rs index 63ff79f7e..18bc48609 100644 --- a/noodles-bam/src/bai/reader.rs +++ b/noodles-bam/src/bai/reader.rs @@ -1,9 +1,7 @@ -use std::{ - collections::HashMap, - io::{self, Read}, -}; +use std::io::{self, Read}; use byteorder::{LittleEndian, ReadBytesExt}; +use indexmap::IndexMap; use noodles_bgzf as bgzf; use noodles_csi::{ index::{ @@ -141,7 +139,7 @@ where Ok(references) } -fn read_bins(reader: &mut R) -> io::Result<(HashMap, Option)> +fn read_bins(reader: &mut R) -> io::Result<(IndexMap, Option)> where R: Read, { @@ -149,7 +147,7 @@ where const METADATA_ID: usize = Bin::metadata_id(DEPTH); - fn duplicate_bin_error(id: usize) -> io::Result<(HashMap, Option)> { + fn duplicate_bin_error(id: usize) -> io::Result<(IndexMap, Option)> { Err(io::Error::new( io::ErrorKind::InvalidData, format!("duplicate bin ID: {id}"), @@ -160,7 +158,7 @@ where usize::try_from(n).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e)) })?; - let mut bins = HashMap::with_capacity(n_bin); + let mut bins = IndexMap::with_capacity(n_bin); let mut metadata = None; for _ in 0..n_bin { diff --git a/noodles-csi/CHANGELOG.md b/noodles-csi/CHANGELOG.md index 8e61f4b8c..6df690711 100644 --- a/noodles-csi/CHANGELOG.md +++ b/noodles-csi/CHANGELOG.md @@ -1,5 +1,17 @@ # Changelog +## Unreleased + +### Changed + + * csi/index/reference_sequence: Change bins to an ordered map ([#213]). + + Bins now maintain their insertion order. While this does not directly + affect reading and in-memory usage, it does make serialization + deterministic. + +[#213]: https://github.com/zaeleus/noodles/issues/213 + ## 0.25.1 - 2023-10-19 ### Fixed diff --git a/noodles-csi/src/async/reader.rs b/noodles-csi/src/async/reader.rs index 4a1c11e79..e9ca790f2 100644 --- a/noodles-csi/src/async/reader.rs +++ b/noodles-csi/src/async/reader.rs @@ -1,5 +1,4 @@ -use std::collections::HashMap; - +use indexmap::IndexMap; use noodles_bgzf as bgzf; use tokio::io::{self, AsyncRead, AsyncReadExt}; @@ -176,7 +175,7 @@ where async fn read_bins( reader: &mut R, depth: u8, -) -> io::Result<(HashMap, Option)> +) -> io::Result<(IndexMap, Option)> where R: AsyncRead + Unpin, { @@ -184,7 +183,7 @@ where usize::try_from(n).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e)) })?; - let mut bins = HashMap::with_capacity(n_bin); + let mut bins = IndexMap::with_capacity(n_bin); let metadata_id = Bin::metadata_id(depth); let mut metadata = None; diff --git a/noodles-csi/src/async/writer.rs b/noodles-csi/src/async/writer.rs index 747d83163..ccdaa8509 100644 --- a/noodles-csi/src/async/writer.rs +++ b/noodles-csi/src/async/writer.rs @@ -1,5 +1,4 @@ -use std::collections::HashMap; - +use indexmap::IndexMap; use noodles_bgzf as bgzf; use tokio::io::{self, AsyncWrite, AsyncWriteExt}; @@ -183,7 +182,7 @@ where async fn write_bins( writer: &mut W, depth: u8, - bins: &HashMap, + bins: &IndexMap, metadata: Option<&Metadata>, ) -> io::Result<()> where diff --git a/noodles-csi/src/index/reference_sequence.rs b/noodles-csi/src/index/reference_sequence.rs index 76980c129..316e591e9 100644 --- a/noodles-csi/src/index/reference_sequence.rs +++ b/noodles-csi/src/index/reference_sequence.rs @@ -6,9 +6,10 @@ mod metadata; pub use self::{bin::Bin, builder::Builder, metadata::Metadata}; -use std::{collections::HashMap, io, num::NonZeroUsize}; +use std::{io, num::NonZeroUsize}; use bit_vec::BitVec; +use indexmap::IndexMap; use noodles_bgzf as bgzf; use noodles_core::{region::Interval, Position}; @@ -21,7 +22,7 @@ const LINEAR_INDEX_WINDOW_SIZE: usize = 1 << 14; /// A CSI reference sequence. #[derive(Clone, Debug, Eq, PartialEq)] pub struct ReferenceSequence { - bins: HashMap, + bins: IndexMap, linear_index: Vec, metadata: Option, } @@ -42,7 +43,7 @@ impl ReferenceSequence { /// let reference_sequence = ReferenceSequence::new(Default::default(), Vec::new(), None); /// ``` pub fn new( - bins: HashMap, + bins: IndexMap, linear_index: Vec, metadata: Option, ) -> Self { @@ -64,7 +65,7 @@ impl ReferenceSequence { /// let reference_sequence = ReferenceSequence::new(Default::default(), Vec::new(), None); /// assert!(reference_sequence.bins().is_empty()); /// ``` - pub fn bins(&self) -> &HashMap { + pub fn bins(&self) -> &IndexMap { &self.bins } diff --git a/noodles-csi/src/index/reference_sequence/builder.rs b/noodles-csi/src/index/reference_sequence/builder.rs index d991177b9..35e0e9527 100644 --- a/noodles-csi/src/index/reference_sequence/builder.rs +++ b/noodles-csi/src/index/reference_sequence/builder.rs @@ -1,5 +1,4 @@ -use std::collections::HashMap; - +use indexmap::IndexMap; use noodles_bgzf as bgzf; use noodles_core::Position; @@ -11,7 +10,7 @@ use super::{ /// A CSI reference sequence builder. #[derive(Debug)] pub struct Builder { - bin_builders: HashMap, + bin_builders: IndexMap, linear_index: Vec>, start_position: bgzf::VirtualPosition, end_position: bgzf::VirtualPosition, @@ -128,7 +127,7 @@ impl Builder { impl Default for Builder { fn default() -> Self { Self { - bin_builders: HashMap::new(), + bin_builders: IndexMap::new(), linear_index: Vec::new(), start_position: bgzf::VirtualPosition::MAX, end_position: bgzf::VirtualPosition::MIN, @@ -226,11 +225,7 @@ mod tests { ReferenceSequence::new(bins, linear_index, Some(metadata)) }; - for (id, expected_bin) in expected.bins() { - let actual_bin = actual.bins().get(id).expect("missing bin"); - assert_eq!(actual_bin, expected_bin); - } - + assert_eq!(actual.bins(), expected.bins()); assert_eq!(actual.linear_index(), expected.linear_index()); assert_eq!(actual.metadata(), expected.metadata()); diff --git a/noodles-csi/src/reader.rs b/noodles-csi/src/reader.rs index d617bdec1..b33345585 100644 --- a/noodles-csi/src/reader.rs +++ b/noodles-csi/src/reader.rs @@ -1,10 +1,10 @@ use std::{ - collections::HashMap, io::{self, Read}, str, }; use byteorder::{LittleEndian, ReadBytesExt}; +use indexmap::IndexMap; use noodles_bgzf as bgzf; use super::{ @@ -234,11 +234,11 @@ where Ok(ReferenceSequence::new(bins, Vec::new(), metadata)) } -fn read_bins(reader: &mut R, depth: u8) -> io::Result<(HashMap, Option)> +fn read_bins(reader: &mut R, depth: u8) -> io::Result<(IndexMap, Option)> where R: Read, { - fn duplicate_bin_error(id: usize) -> io::Result<(HashMap, Option)> { + fn duplicate_bin_error(id: usize) -> io::Result<(IndexMap, Option)> { Err(io::Error::new( io::ErrorKind::InvalidData, format!("duplicate bin ID: {id}"), @@ -249,7 +249,7 @@ where usize::try_from(n).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e)) })?; - let mut bins = HashMap::with_capacity(n_bin); + let mut bins = IndexMap::with_capacity(n_bin); let metadata_id = Bin::metadata_id(depth); let mut metadata = None; diff --git a/noodles-csi/src/writer.rs b/noodles-csi/src/writer.rs index ac280f1eb..82e152400 100644 --- a/noodles-csi/src/writer.rs +++ b/noodles-csi/src/writer.rs @@ -1,9 +1,7 @@ -use std::{ - collections::HashMap, - io::{self, Write}, -}; +use std::io::{self, Write}; use byteorder::{LittleEndian, WriteBytesExt}; +use indexmap::IndexMap; use noodles_bgzf as bgzf; use super::{ @@ -183,7 +181,7 @@ where fn write_bins( writer: &mut W, depth: u8, - bins: &HashMap, + bins: &IndexMap, metadata: Option<&Metadata>, ) -> io::Result<()> where diff --git a/noodles-tabix/Cargo.toml b/noodles-tabix/Cargo.toml index 381aae616..cfe2515f3 100644 --- a/noodles-tabix/Cargo.toml +++ b/noodles-tabix/Cargo.toml @@ -16,6 +16,7 @@ async = ["dep:tokio", "noodles-bgzf/async"] [dependencies] bit-vec.workspace = true byteorder.workspace = true +indexmap.workspace = true noodles-bgzf = { path = "../noodles-bgzf", version = "0.25.0" } noodles-core = { path = "../noodles-core", version = "0.12.0" } noodles-csi = { path = "../noodles-csi", version = "0.25.1" } diff --git a/noodles-tabix/src/async/reader.rs b/noodles-tabix/src/async/reader.rs index ff3a853fe..481ccd34a 100644 --- a/noodles-tabix/src/async/reader.rs +++ b/noodles-tabix/src/async/reader.rs @@ -1,5 +1,4 @@ -use std::collections::HashMap; - +use indexmap::IndexMap; use noodles_bgzf as bgzf; use noodles_csi::index::{ header::{Format, ReferenceSequenceNames}, @@ -230,7 +229,7 @@ where Ok(ReferenceSequence::new(bins, intervals, metadata)) } -async fn read_bins(reader: &mut R) -> io::Result<(HashMap, Option)> +async fn read_bins(reader: &mut R) -> io::Result<(IndexMap, Option)> where R: AsyncRead + Unpin, { @@ -242,7 +241,7 @@ where usize::try_from(n).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e)) })?; - let mut bins = HashMap::with_capacity(n_bin); + let mut bins = IndexMap::with_capacity(n_bin); let mut metadata = None; for _ in 0..n_bin { diff --git a/noodles-tabix/src/async/writer.rs b/noodles-tabix/src/async/writer.rs index d82f5bf93..e5df074a6 100644 --- a/noodles-tabix/src/async/writer.rs +++ b/noodles-tabix/src/async/writer.rs @@ -1,5 +1,4 @@ -use std::collections::HashMap; - +use indexmap::IndexMap; use noodles_bgzf as bgzf; use noodles_csi::index::{ header::ReferenceSequenceNames, @@ -220,7 +219,7 @@ where async fn write_bins( writer: &mut W, - bins: &HashMap, + bins: &IndexMap, metadata: Option<&Metadata>, ) -> io::Result<()> where diff --git a/noodles-tabix/src/reader.rs b/noodles-tabix/src/reader.rs index e4ce5549b..109455a66 100644 --- a/noodles-tabix/src/reader.rs +++ b/noodles-tabix/src/reader.rs @@ -1,11 +1,10 @@ use std::{ - collections::HashMap, io::{self, Read}, str, }; -use super::MAGIC_NUMBER; use byteorder::{LittleEndian, ReadBytesExt}; +use indexmap::IndexMap; use noodles_bgzf as bgzf; use noodles_csi::{ index::{ @@ -16,6 +15,8 @@ use noodles_csi::{ Index, }; +use super::MAGIC_NUMBER; + const NUL: u8 = b'\x00'; /// A tabix reader. @@ -236,7 +237,7 @@ where Ok(references) } -fn read_bins(reader: &mut R) -> io::Result<(HashMap, Option)> +fn read_bins(reader: &mut R) -> io::Result<(IndexMap, Option)> where R: Read, { @@ -248,7 +249,7 @@ where usize::try_from(n).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e)) })?; - let mut bins = HashMap::with_capacity(n_bin); + let mut bins = IndexMap::with_capacity(n_bin); let mut metadata = None; for _ in 0..n_bin {