Skip to content

Commit

Permalink
bcf/reader/header: Parse header line by line
Browse files Browse the repository at this point in the history
The header parser can now build a `vcf::Header` and `StringMaps` by
parsing a raw header line by line. This makes it so that it is no longer
required to read the entire header into memory before parsing.
  • Loading branch information
zaeleus committed Nov 2, 2023
1 parent 2688c15 commit 5fa37dd
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 34 deletions.
10 changes: 10 additions & 0 deletions noodles-bcf/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
# Changelog

## Unreleased

### Changed

* bcf/reader/header: Parse header line by line.

The header parser can now build a `vcf::Header` and `StringMaps` by parsing
a raw header line by line, so the entire raw header no longer has to be
read into memory before parsing.

## 0.41.0 - 2023-10-26

### Changed
Expand Down
15 changes: 14 additions & 1 deletion noodles-bcf/src/header/string_maps.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,10 @@ use std::str::{FromStr, Lines};

use noodles_vcf::{
self as vcf,
header::{parser::parse_record, ParseError, Record},
header::{
parser::{parse_record, Entry},
ParseError, Record,
},
};

pub use self::string_map::StringMap;
Expand Down Expand Up @@ -98,6 +101,16 @@ impl StringMaps {
fn contigs_mut(&mut self) -> &mut ContigStringMap {
&mut self.contig_string_map
}

/// Registers a parsed header entry in the string map that matches its kind.
///
/// Contig entries go into the contig string map (`contigs_mut()`); filter,
/// format, and info entries go into the map returned by `strings_mut()`.
/// Each is inserted under its ID together with the record's `idx()` value.
/// Every other entry kind is ignored and reported as success.
///
/// # Errors
///
/// Returns whatever `ParseError` the underlying `insert` call produces.
pub(crate) fn insert_entry(&mut self, entry: &Entry<'_>) -> Result<(), ParseError> {
    match entry {
        Entry::Contig(id, contig) => {
            let contigs = self.contigs_mut();
            insert(contigs, id.as_ref(), contig.idx())
        }
        Entry::Filter(id, filter) => {
            let strings = self.strings_mut();
            insert(strings, id, filter.idx())
        }
        Entry::Format(id, format) => {
            let strings = self.strings_mut();
            insert(strings, id.as_ref(), format.idx())
        }
        Entry::Info(id, info) => {
            let strings = self.strings_mut();
            insert(strings, id.as_ref(), info.idx())
        }
        // Remaining entry kinds carry nothing to index.
        _ => Ok(()),
    }
}
}

impl Default for StringMaps {
Expand Down
96 changes: 63 additions & 33 deletions noodles-bcf/src/reader/header.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
use std::{
ffi::CStr,
io::{self, Read},
};
use std::io::{self, BufRead, BufReader, Read};

use byteorder::{LittleEndian, ReadBytesExt};
use noodles_vcf as vcf;
Expand All @@ -12,58 +9,91 @@ pub(super) fn read_header<R>(reader: &mut R) -> io::Result<(vcf::Header, StringM
where
R: Read,
{
let raw_header = read_raw_header(reader)?;
const NUL: u8 = 0x00;

let header = raw_header
.parse()
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
let l_text = reader.read_u32::<LittleEndian>().map(u64::from)?;

let mut parser = vcf::header::Parser::default();
let mut string_maps = StringMaps::default();

let mut header_reader = BufReader::new(reader.take(l_text));
let mut buf = Vec::new();

while read_line(&mut header_reader, &mut buf)? != 0 {
if buf == [NUL] {
break;
}

let string_maps = raw_header
.parse()
let entry = parser
.parse_partial(&buf)
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;

string_maps
.insert_entry(&entry)
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
}

let header = parser
.finish()
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;

Ok((header, string_maps))
}

pub fn read_raw_header<R>(reader: &mut R) -> io::Result<String>
fn read_line<R>(reader: &mut R, dst: &mut Vec<u8>) -> io::Result<usize>
where
R: Read,
R: BufRead,
{
let l_text = reader.read_u32::<LittleEndian>().and_then(|n| {
usize::try_from(n).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))
})?;

let mut buf = vec![0; l_text];
reader.read_exact(&mut buf)?;

CStr::from_bytes_with_nul(&buf)
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))
.and_then(|c_header| {
c_header
.to_str()
.map(|s| s.into())
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))
})
const LINE_FEED: u8 = b'\n';
const CARRIAGE_RETURN: u8 = b'\r';

// `dst` is reused across calls, so drop the previous line before reading.
dst.clear();

// `read_until` appends everything up to and including the delimiter.
match reader.read_until(LINE_FEED, dst)? {
// Nothing was read: the reader is at end of input.
0 => Ok(0),
n => {
// Strip the trailing LF and, for CRLF-terminated lines, the CR before
// it. A final line with no LF is left as is.
if dst.ends_with(&[LINE_FEED]) {
dst.pop();

if dst.ends_with(&[CARRIAGE_RETURN]) {
dst.pop();
}
}

// Return the raw byte count (terminator included), so a blank line is
// nonzero and stays distinguishable from end of input.
Ok(n)
}
}
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn test_read_raw_header() -> io::Result<()> {
fn test_read_header() -> io::Result<()> {
use vcf::header::FileFormat;

const NUL: u8 = 0x00;

let raw_header = "##fileformat=VCFv4.3\n";
let raw_header = b"##fileformat=VCFv4.3
#CHROM POS ID REF ALT QUAL FILTER INFO
";

let mut data = 22u32.to_le_bytes().to_vec(); // l_text = 22
data.extend_from_slice(raw_header.as_bytes());
let mut data = 60u32.to_le_bytes().to_vec(); // l_text = 60
data.extend_from_slice(raw_header);
// The raw header text is followed by a NUL byte.
data.push(NUL);

let mut reader = &data[..];
let actual = read_raw_header(&mut reader)?;
let (actual_header, actual_string_maps) = read_header(&mut reader)?;

let expected_header = vcf::Header::builder()
.set_file_format(FileFormat::new(4, 3))
.build();

// The header has no contig, filter, format, or info records, so the string
// maps are expected to equal the default.
let expected_string_maps = StringMaps::default();

assert_eq!(actual, raw_header);
assert_eq!(actual_header, expected_header);
assert_eq!(actual_string_maps, expected_string_maps);

Ok(())
}
Expand Down

0 comments on commit 5fa37dd

Please sign in to comment.