diff --git a/noodles-bcf/CHANGELOG.md b/noodles-bcf/CHANGELOG.md index 1f573690a..3b15f2eea 100644 --- a/noodles-bcf/CHANGELOG.md +++ b/noodles-bcf/CHANGELOG.md @@ -1,5 +1,15 @@ # Changelog +## Unreleased + +### Changed + + * bcf/reader/header: Parse header line by line. + + The header parser can now build a `vcf::Header` and `StringMaps` by parsing + a raw header line by line. This makes it so that it is no longer required + to read the entire raw header into memory before parsing. + ## 0.41.0 - 2023-10-26 ### Changed diff --git a/noodles-bcf/src/header/string_maps.rs b/noodles-bcf/src/header/string_maps.rs index b8dcbb5aa..f45aa1f97 100644 --- a/noodles-bcf/src/header/string_maps.rs +++ b/noodles-bcf/src/header/string_maps.rs @@ -6,7 +6,10 @@ use std::str::{FromStr, Lines}; use noodles_vcf::{ self as vcf, - header::{parser::parse_record, ParseError, Record}, + header::{ + parser::{parse_record, Entry}, + ParseError, Record, + }, }; pub use self::string_map::StringMap; @@ -98,6 +101,16 @@ impl StringMaps { fn contigs_mut(&mut self) -> &mut ContigStringMap { &mut self.contig_string_map } + + pub(crate) fn insert_entry(&mut self, entry: &Entry<'_>) -> Result<(), ParseError> { + match entry { + Entry::Contig(id, contig) => insert(self.contigs_mut(), id.as_ref(), contig.idx()), + Entry::Filter(id, filter) => insert(self.strings_mut(), id, filter.idx()), + Entry::Format(id, format) => insert(self.strings_mut(), id.as_ref(), format.idx()), + Entry::Info(id, info) => insert(self.strings_mut(), id.as_ref(), info.idx()), + _ => Ok(()), + } + } } impl Default for StringMaps { diff --git a/noodles-bcf/src/reader/header.rs b/noodles-bcf/src/reader/header.rs index 03dedb7c5..43208641f 100644 --- a/noodles-bcf/src/reader/header.rs +++ b/noodles-bcf/src/reader/header.rs @@ -1,7 +1,4 @@ -use std::{ - ffi::CStr, - io::{self, Read}, -}; +use std::io::{self, BufRead, BufReader, Read}; use byteorder::{LittleEndian, ReadBytesExt}; use noodles_vcf as vcf; @@ -12,38 +9,60 @@ pub(super) 
fn read_header(reader: &mut R) -> io::Result<(vcf::Header, StringM where R: Read, { - let raw_header = read_raw_header(reader)?; + const NUL: u8 = 0x00; - let header = raw_header - .parse() - .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + let l_text = reader.read_u32::().map(u64::from)?; + + let mut parser = vcf::header::Parser::default(); + let mut string_maps = StringMaps::default(); + + let mut header_reader = BufReader::new(reader.take(l_text)); + let mut buf = Vec::new(); + + while read_line(&mut header_reader, &mut buf)? != 0 { + if buf == [NUL] { + break; + } - let string_maps = raw_header - .parse() + let entry = parser + .parse_partial(&buf) + .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + + string_maps + .insert_entry(&entry) + .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + } + + let header = parser + .finish() .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; Ok((header, string_maps)) } -pub fn read_raw_header(reader: &mut R) -> io::Result +fn read_line(reader: &mut R, dst: &mut Vec) -> io::Result where - R: Read, + R: BufRead, { - let l_text = reader.read_u32::().and_then(|n| { - usize::try_from(n).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e)) - })?; - - let mut buf = vec![0; l_text]; - reader.read_exact(&mut buf)?; - - CStr::from_bytes_with_nul(&buf) - .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e)) - .and_then(|c_header| { - c_header - .to_str() - .map(|s| s.into()) - .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e)) - }) + const LINE_FEED: u8 = b'\n'; + const CARRIAGE_RETURN: u8 = b'\r'; + + dst.clear(); + + match reader.read_until(LINE_FEED, dst)? 
{ + 0 => Ok(0), + n => { + if dst.ends_with(&[LINE_FEED]) { + dst.pop(); + + if dst.ends_with(&[CARRIAGE_RETURN]) { + dst.pop(); + } + } + + Ok(n) + } + } } #[cfg(test)] @@ -51,19 +70,30 @@ mod tests { use super::*; #[test] - fn test_read_raw_header() -> io::Result<()> { + fn test_read_header() -> io::Result<()> { + use vcf::header::FileFormat; + const NUL: u8 = 0x00; - let raw_header = "##fileformat=VCFv4.3\n"; + let raw_header = b"##fileformat=VCFv4.3 +#CHROM POS ID REF ALT QUAL FILTER INFO +"; - let mut data = 22u32.to_le_bytes().to_vec(); // l_text = 22 - data.extend_from_slice(raw_header.as_bytes()); + let mut data = 60u32.to_le_bytes().to_vec(); // l_text = 60 + data.extend_from_slice(raw_header); data.push(NUL); let mut reader = &data[..]; - let actual = read_raw_header(&mut reader)?; + let (actual_header, actual_string_maps) = read_header(&mut reader)?; + + let expected_header = vcf::Header::builder() + .set_file_format(FileFormat::new(4, 3)) + .build(); + + let expected_string_maps = StringMaps::default(); - assert_eq!(actual, raw_header); + assert_eq!(actual_header, expected_header); + assert_eq!(actual_string_maps, expected_string_maps); Ok(()) }