Skip to content

Commit

Permalink
csi/index/reference_sequence: Change bins to an ordered map
Browse files Browse the repository at this point in the history
Bins now maintain their insertion order. While this does not directly
affect reading and in-memory usage, it does make serialization
deterministic.

Closes #213.
  • Loading branch information
zaeleus committed Oct 24, 2023
1 parent bc38d95 commit 89b9acd
Show file tree
Hide file tree
Showing 15 changed files with 56 additions and 55 deletions.
1 change: 1 addition & 0 deletions noodles-bam/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ bit-vec.workspace = true
byteorder.workspace = true
bytes.workspace = true
futures = { workspace = true, optional = true, features = ["std"] }
indexmap.workspace = true
tokio = { workspace = true, optional = true, features = ["fs", "io-util"] }

noodles-bgzf = { path = "../noodles-bgzf", version = "0.25.0" }
Expand Down
9 changes: 4 additions & 5 deletions noodles-bam/src/bai/async/reader.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
use std::collections::HashMap;

use indexmap::IndexMap;
use noodles_bgzf as bgzf;
use noodles_csi::{
index::{
Expand Down Expand Up @@ -144,15 +143,15 @@ where
Ok(ReferenceSequence::new(bins, intervals, metadata))
}

async fn read_bins<R>(reader: &mut R) -> io::Result<(HashMap<usize, Bin>, Option<Metadata>)>
async fn read_bins<R>(reader: &mut R) -> io::Result<(IndexMap<usize, Bin>, Option<Metadata>)>
where
R: AsyncRead + Unpin,
{
use crate::bai::DEPTH;

const METADATA_ID: usize = Bin::metadata_id(DEPTH);

fn duplicate_bin_error(id: usize) -> io::Result<(HashMap<usize, Bin>, Option<Metadata>)> {
fn duplicate_bin_error(id: usize) -> io::Result<(IndexMap<usize, Bin>, Option<Metadata>)> {
Err(io::Error::new(
io::ErrorKind::InvalidData,
format!("duplicate bin ID: {id}"),
Expand All @@ -163,7 +162,7 @@ where
usize::try_from(n).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))
})?;

let mut bins = HashMap::with_capacity(n_bin);
let mut bins = IndexMap::with_capacity(n_bin);
let mut metadata = None;

for _ in 0..n_bin {
Expand Down
5 changes: 2 additions & 3 deletions noodles-bam/src/bai/async/writer.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
use std::collections::HashMap;

use indexmap::IndexMap;
use noodles_bgzf as bgzf;
use noodles_csi::index::{
reference_sequence::{bin::Chunk, Bin, Metadata},
Expand Down Expand Up @@ -163,7 +162,7 @@ where

async fn write_bins<W>(
writer: &mut W,
bins: &HashMap<usize, Bin>,
bins: &IndexMap<usize, Bin>,
metadata: Option<&Metadata>,
) -> io::Result<()>
where
Expand Down
12 changes: 5 additions & 7 deletions noodles-bam/src/bai/reader.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
use std::{
collections::HashMap,
io::{self, Read},
};
use std::io::{self, Read};

use byteorder::{LittleEndian, ReadBytesExt};
use indexmap::IndexMap;
use noodles_bgzf as bgzf;
use noodles_csi::{
index::{
Expand Down Expand Up @@ -141,15 +139,15 @@ where
Ok(references)
}

fn read_bins<R>(reader: &mut R) -> io::Result<(HashMap<usize, Bin>, Option<Metadata>)>
fn read_bins<R>(reader: &mut R) -> io::Result<(IndexMap<usize, Bin>, Option<Metadata>)>
where
R: Read,
{
use super::DEPTH;

const METADATA_ID: usize = Bin::metadata_id(DEPTH);

fn duplicate_bin_error(id: usize) -> io::Result<(HashMap<usize, Bin>, Option<Metadata>)> {
fn duplicate_bin_error(id: usize) -> io::Result<(IndexMap<usize, Bin>, Option<Metadata>)> {
Err(io::Error::new(
io::ErrorKind::InvalidData,
format!("duplicate bin ID: {id}"),
Expand All @@ -160,7 +158,7 @@ where
usize::try_from(n).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))
})?;

let mut bins = HashMap::with_capacity(n_bin);
let mut bins = IndexMap::with_capacity(n_bin);
let mut metadata = None;

for _ in 0..n_bin {
Expand Down
12 changes: 12 additions & 0 deletions noodles-csi/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,17 @@
# Changelog

## Unreleased

### Changed

* csi/index/reference_sequence: Change bins to an ordered map ([#213]).

Bins now maintain their insertion order. While this does not directly
affect reading and in-memory usage, it does make serialization
deterministic.

[#213]: https://github.com/zaeleus/noodles/issues/213

## 0.25.1 - 2023-10-19

### Fixed
Expand Down
7 changes: 3 additions & 4 deletions noodles-csi/src/async/reader.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
use std::collections::HashMap;

use indexmap::IndexMap;
use noodles_bgzf as bgzf;
use tokio::io::{self, AsyncRead, AsyncReadExt};

Expand Down Expand Up @@ -176,15 +175,15 @@ where
async fn read_bins<R>(
reader: &mut R,
depth: u8,
) -> io::Result<(HashMap<usize, Bin>, Option<Metadata>)>
) -> io::Result<(IndexMap<usize, Bin>, Option<Metadata>)>
where
R: AsyncRead + Unpin,
{
let n_bin = reader.read_i32_le().await.and_then(|n| {
usize::try_from(n).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))
})?;

let mut bins = HashMap::with_capacity(n_bin);
let mut bins = IndexMap::with_capacity(n_bin);

let metadata_id = Bin::metadata_id(depth);
let mut metadata = None;
Expand Down
5 changes: 2 additions & 3 deletions noodles-csi/src/async/writer.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
use std::collections::HashMap;

use indexmap::IndexMap;
use noodles_bgzf as bgzf;
use tokio::io::{self, AsyncWrite, AsyncWriteExt};

Expand Down Expand Up @@ -183,7 +182,7 @@ where
async fn write_bins<W>(
writer: &mut W,
depth: u8,
bins: &HashMap<usize, Bin>,
bins: &IndexMap<usize, Bin>,
metadata: Option<&Metadata>,
) -> io::Result<()>
where
Expand Down
9 changes: 5 additions & 4 deletions noodles-csi/src/index/reference_sequence.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@ mod metadata;

pub use self::{bin::Bin, builder::Builder, metadata::Metadata};

use std::{collections::HashMap, io, num::NonZeroUsize};
use std::{io, num::NonZeroUsize};

use bit_vec::BitVec;
use indexmap::IndexMap;
use noodles_bgzf as bgzf;
use noodles_core::{region::Interval, Position};

Expand All @@ -21,7 +22,7 @@ const LINEAR_INDEX_WINDOW_SIZE: usize = 1 << 14;
/// A CSI reference sequence.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct ReferenceSequence {
bins: HashMap<usize, Bin>,
bins: IndexMap<usize, Bin>,
linear_index: Vec<bgzf::VirtualPosition>,
metadata: Option<Metadata>,
}
Expand All @@ -42,7 +43,7 @@ impl ReferenceSequence {
/// let reference_sequence = ReferenceSequence::new(Default::default(), Vec::new(), None);
/// ```
pub fn new(
bins: HashMap<usize, Bin>,
bins: IndexMap<usize, Bin>,
linear_index: Vec<bgzf::VirtualPosition>,
metadata: Option<Metadata>,
) -> Self {
Expand All @@ -64,7 +65,7 @@ impl ReferenceSequence {
/// let reference_sequence = ReferenceSequence::new(Default::default(), Vec::new(), None);
/// assert!(reference_sequence.bins().is_empty());
/// ```
pub fn bins(&self) -> &HashMap<usize, Bin> {
pub fn bins(&self) -> &IndexMap<usize, Bin> {
&self.bins
}

Expand Down
13 changes: 4 additions & 9 deletions noodles-csi/src/index/reference_sequence/builder.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
use std::collections::HashMap;

use indexmap::IndexMap;
use noodles_bgzf as bgzf;
use noodles_core::Position;

Expand All @@ -11,7 +10,7 @@ use super::{
/// A CSI reference sequence builder.
#[derive(Debug)]
pub struct Builder {
bin_builders: HashMap<usize, bin::Builder>,
bin_builders: IndexMap<usize, bin::Builder>,
linear_index: Vec<Option<bgzf::VirtualPosition>>,
start_position: bgzf::VirtualPosition,
end_position: bgzf::VirtualPosition,
Expand Down Expand Up @@ -128,7 +127,7 @@ impl Builder {
impl Default for Builder {
fn default() -> Self {
Self {
bin_builders: HashMap::new(),
bin_builders: IndexMap::new(),
linear_index: Vec::new(),
start_position: bgzf::VirtualPosition::MAX,
end_position: bgzf::VirtualPosition::MIN,
Expand Down Expand Up @@ -226,11 +225,7 @@ mod tests {
ReferenceSequence::new(bins, linear_index, Some(metadata))
};

for (id, expected_bin) in expected.bins() {
let actual_bin = actual.bins().get(id).expect("missing bin");
assert_eq!(actual_bin, expected_bin);
}

assert_eq!(actual.bins(), expected.bins());
assert_eq!(actual.linear_index(), expected.linear_index());
assert_eq!(actual.metadata(), expected.metadata());

Expand Down
8 changes: 4 additions & 4 deletions noodles-csi/src/reader.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
use std::{
collections::HashMap,
io::{self, Read},
str,
};

use byteorder::{LittleEndian, ReadBytesExt};
use indexmap::IndexMap;
use noodles_bgzf as bgzf;

use super::{
Expand Down Expand Up @@ -234,11 +234,11 @@ where
Ok(ReferenceSequence::new(bins, Vec::new(), metadata))
}

fn read_bins<R>(reader: &mut R, depth: u8) -> io::Result<(HashMap<usize, Bin>, Option<Metadata>)>
fn read_bins<R>(reader: &mut R, depth: u8) -> io::Result<(IndexMap<usize, Bin>, Option<Metadata>)>
where
R: Read,
{
fn duplicate_bin_error(id: usize) -> io::Result<(HashMap<usize, Bin>, Option<Metadata>)> {
fn duplicate_bin_error(id: usize) -> io::Result<(IndexMap<usize, Bin>, Option<Metadata>)> {
Err(io::Error::new(
io::ErrorKind::InvalidData,
format!("duplicate bin ID: {id}"),
Expand All @@ -249,7 +249,7 @@ where
usize::try_from(n).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))
})?;

let mut bins = HashMap::with_capacity(n_bin);
let mut bins = IndexMap::with_capacity(n_bin);

let metadata_id = Bin::metadata_id(depth);
let mut metadata = None;
Expand Down
8 changes: 3 additions & 5 deletions noodles-csi/src/writer.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
use std::{
collections::HashMap,
io::{self, Write},
};
use std::io::{self, Write};

use byteorder::{LittleEndian, WriteBytesExt};
use indexmap::IndexMap;
use noodles_bgzf as bgzf;

use super::{
Expand Down Expand Up @@ -183,7 +181,7 @@ where
fn write_bins<W>(
writer: &mut W,
depth: u8,
bins: &HashMap<usize, Bin>,
bins: &IndexMap<usize, Bin>,
metadata: Option<&Metadata>,
) -> io::Result<()>
where
Expand Down
1 change: 1 addition & 0 deletions noodles-tabix/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ async = ["dep:tokio", "noodles-bgzf/async"]
[dependencies]
bit-vec.workspace = true
byteorder.workspace = true
indexmap.workspace = true
noodles-bgzf = { path = "../noodles-bgzf", version = "0.25.0" }
noodles-core = { path = "../noodles-core", version = "0.12.0" }
noodles-csi = { path = "../noodles-csi", version = "0.25.1" }
Expand Down
7 changes: 3 additions & 4 deletions noodles-tabix/src/async/reader.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
use std::collections::HashMap;

use indexmap::IndexMap;
use noodles_bgzf as bgzf;
use noodles_csi::index::{
header::{Format, ReferenceSequenceNames},
Expand Down Expand Up @@ -230,7 +229,7 @@ where
Ok(ReferenceSequence::new(bins, intervals, metadata))
}

async fn read_bins<R>(reader: &mut R) -> io::Result<(HashMap<usize, Bin>, Option<Metadata>)>
async fn read_bins<R>(reader: &mut R) -> io::Result<(IndexMap<usize, Bin>, Option<Metadata>)>
where
R: AsyncRead + Unpin,
{
Expand All @@ -242,7 +241,7 @@ where
usize::try_from(n).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))
})?;

let mut bins = HashMap::with_capacity(n_bin);
let mut bins = IndexMap::with_capacity(n_bin);
let mut metadata = None;

for _ in 0..n_bin {
Expand Down
5 changes: 2 additions & 3 deletions noodles-tabix/src/async/writer.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
use std::collections::HashMap;

use indexmap::IndexMap;
use noodles_bgzf as bgzf;
use noodles_csi::index::{
header::ReferenceSequenceNames,
Expand Down Expand Up @@ -220,7 +219,7 @@ where

async fn write_bins<W>(
writer: &mut W,
bins: &HashMap<usize, Bin>,
bins: &IndexMap<usize, Bin>,
metadata: Option<&Metadata>,
) -> io::Result<()>
where
Expand Down
9 changes: 5 additions & 4 deletions noodles-tabix/src/reader.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
use std::{
collections::HashMap,
io::{self, Read},
str,
};

use super::MAGIC_NUMBER;
use byteorder::{LittleEndian, ReadBytesExt};
use indexmap::IndexMap;
use noodles_bgzf as bgzf;
use noodles_csi::{
index::{
Expand All @@ -16,6 +15,8 @@ use noodles_csi::{
Index,
};

use super::MAGIC_NUMBER;

const NUL: u8 = b'\x00';

/// A tabix reader.
Expand Down Expand Up @@ -236,7 +237,7 @@ where
Ok(references)
}

fn read_bins<R>(reader: &mut R) -> io::Result<(HashMap<usize, Bin>, Option<Metadata>)>
fn read_bins<R>(reader: &mut R) -> io::Result<(IndexMap<usize, Bin>, Option<Metadata>)>
where
R: Read,
{
Expand All @@ -248,7 +249,7 @@ where
usize::try_from(n).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))
})?;

let mut bins = HashMap::with_capacity(n_bin);
let mut bins = IndexMap::with_capacity(n_bin);
let mut metadata = None;

for _ in 0..n_bin {
Expand Down

0 comments on commit 89b9acd

Please sign in to comment.