From 683c84d0dcc17cae659bebb14120f23f7bf2ff4d Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Wed, 8 May 2024 11:22:53 +0200 Subject: [PATCH] fix: update noodles group (#447) * update noodles group * update insta snapshot for test_multiquery --- Cargo.lock | 52 +- Cargo.toml | 9 +- src/common/keys.rs | 16 +- src/common/noodles.rs | 58 +- src/dbsnp/cli/import.rs | 9 +- src/dbsnp/pbs.rs | 24 +- src/freqs/cli/import/auto.rs | 13 +- src/freqs/cli/import/mod.rs | 21 +- src/freqs/cli/import/mt.rs | 13 +- src/freqs/cli/import/reading.rs | 68 +- ...li__import__reading__test__multiquery.snap | 1046 ++++------------- src/freqs/cli/import/xy.rs | 13 +- src/freqs/serialized/auto.rs | 3 +- src/freqs/serialized/mt.rs | 3 +- src/freqs/serialized/xy.rs | 3 +- src/gnomad_mtdna/cli/import.rs | 5 +- src/gnomad_nuclear/cli/import.rs | 7 +- src/gnomad_sv/cli/import/gnomad_cnv4.rs | 22 +- src/gnomad_sv/cli/import/gnomad_sv2.rs | 62 +- src/gnomad_sv/cli/import/gnomad_sv4.rs | 69 +- src/helixmtdb/cli/import.rs | 5 +- src/helixmtdb/pbs.rs | 85 +- src/pbs/gnomad/gnomad2.rs | 68 +- src/pbs/gnomad/gnomad3.rs | 50 +- src/pbs/gnomad/gnomad4.rs | 45 +- src/pbs/gnomad/mtdna.rs | 48 +- 26 files changed, 673 insertions(+), 1144 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 41eed540..1a75a021 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -270,11 +270,12 @@ dependencies = [ "indexmap", "indicatif", "insta", + "itertools", "log", "noodles-bed", - "noodles-bgzf", + "noodles-bgzf 0.29.0", "noodles-core", - "noodles-csi", + "noodles-csi 0.33.0", "noodles-gff", "noodles-tabix", "noodles-vcf", @@ -1699,6 +1700,18 @@ dependencies = [ "flate2", ] +[[package]] +name = "noodles-bgzf" +version = "0.29.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dba1c82e9f92c00b23538359e5d191dff7ccb300cf659ee3a835af65c3cd143" +dependencies = [ + "byteorder", + "bytes", + "crossbeam-channel", + "flate2", +] + [[package]] name = "noodles-core" version = "0.14.0" @@ -1714,7 +1727,20 @@ dependencies = [ "bit-vec", "byteorder", "indexmap", - "noodles-bgzf", + "noodles-bgzf 0.26.0", + "noodles-core", +] + +[[package]] +name = "noodles-csi" +version = "0.33.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "857fcce273172c7ec188369c0e745e95aefc477a67262133862c9ef3375e56b9" +dependencies = [ + "bit-vec", + "byteorder", + "indexmap", + "noodles-bgzf 0.29.0", "noodles-core", ] @@ -1725,37 +1751,37 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "14f8ec87fe3630f57d6d8ea24cbc2cbd0bfed1fe66238bda7a7c3fb6a36d3713" dependencies = [ "indexmap", - "noodles-bgzf", + "noodles-bgzf 0.26.0", "noodles-core", - "noodles-csi", + "noodles-csi 0.30.0", "percent-encoding", ] [[package]] name = "noodles-tabix" -version = "0.36.0" +version = "0.39.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc1ab29335a68d0c2bdf41460a67714ca69e23a1cbeb950ac5c38a9afa446a62" +checksum = "778b5a8cfbe846c7239cdd9f3ba7b9e46a4d46d957822051ae16680713bbd028" dependencies = [ "bit-vec", "byteorder", "indexmap", - "noodles-bgzf", + "noodles-bgzf 0.29.0", "noodles-core", - "noodles-csi", + "noodles-csi 0.33.0", ] [[package]] name = "noodles-vcf" -version = "0.49.0" +version = "0.55.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e1f2fa749afaccadc596ec55ccb7bdcd8101fa79f8382384223c0dbae3e245b" +checksum = "ad890dd39fe923d22112ed0c90ed9835b2c196d6a4e711b20a007852ed63ced9" dependencies = [ "indexmap", "memchr", - "noodles-bgzf", + "noodles-bgzf 0.29.0", "noodles-core", - "noodles-csi", + "noodles-csi 0.33.0", "noodles-tabix", "percent-encoding", ] diff --git a/Cargo.toml b/Cargo.toml index 1a0e5dfc..3e8c2cae 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -37,11 +37,11 @@ indexmap = { version = "2.2", features = ["serde"] } indicatif = { version = "0.17", features = ["rayon"] } log = "0.4" noodles-bed = "0.12" -noodles-bgzf = "0.26" +noodles-bgzf = "0.29" noodles-core = "0.14" -noodles-csi = "0.30" -noodles-tabix = "0.36" -noodles-vcf = "0.49" +noodles-csi = "0.33" +noodles-tabix = "0.39" +noodles-vcf = "0.55" pbjson = "0.6" pbjson-types = "0.6" prost = "0.12" @@ -58,6 +58,7 @@ tracing-subscriber = "0.3" rustc-hash = "1.1.0" noodles-gff = "0.27.0" erased-serde = "0.4.2" +itertools = "0.11.0" [build-dependencies] prost-build = "0.12" diff --git a/src/common/keys.rs b/src/common/keys.rs index 53a6ff02..2ee689e8 100644 --- a/src/common/keys.rs +++ b/src/common/keys.rs @@ -92,19 +92,19 @@ impl Var { } /// Create for all alternate alleles from the given VCF record. - pub fn from_vcf_allele(value: &noodles_vcf::Record, allele_no: usize) -> Self { - let chrom = match value.chromosome() { - noodles_vcf::record::Chromosome::Name(name) - | noodles_vcf::record::Chromosome::Symbol(name) => name.to_owned(), - }; - let pos: usize = value.position().into(); - let pos = pos as i32; + pub fn from_vcf_allele(value: &noodles_vcf::variant::RecordBuf, allele_no: usize) -> Self { + let chrom = value.reference_sequence_name().to_string(); + let pos: usize = value + .variant_start() + .expect("Telomeric breakends not supported") + .get(); + let pos = i32::try_from(pos).unwrap(); let reference = value.reference_bases().to_string(); Var { chrom, pos, reference, - alternative: value.alternate_bases()[allele_no].to_string(), + alternative: value.alternate_bases().as_ref()[allele_no].to_string(), } } } diff --git a/src/common/noodles.rs b/src/common/noodles.rs index 1b88f33c..ecc37fbf 100644 --- a/src/common/noodles.rs +++ b/src/common/noodles.rs @@ -2,14 +2,17 @@ use std::str::FromStr; -use noodles_vcf::record::info::field; +use noodles_vcf::variant::record_buf::info::field; /// Extract a `String` field from a record. -pub fn get_string(record: &noodles_vcf::Record, name: &str) -> Result { - if let Some(Some(field::Value::String(v))) = record.info().get(&field::Key::from_str(name)?) { +pub fn get_string( + record: &noodles_vcf::variant::RecordBuf, + name: &str, +) -> Result { + if let Some(Some(field::Value::String(v))) = record.info().get(name) { Ok(v.to_string()) } else if let Some(Some(field::Value::Array(field::value::Array::String(vs)))) = - record.info().get(&field::Key::from_str(name)?) + record.info().get(name) { Ok(vs.first().unwrap().as_ref().unwrap().to_string()) } else { @@ -18,19 +21,22 @@ pub fn get_string(record: &noodles_vcf::Record, name: &str) -> Result Result { +pub fn get_flag( + record: &noodles_vcf::variant::RecordBuf, + name: &str, +) -> Result { Ok(matches!( - record.info().get(&field::Key::from_str(name)?), + record.info().get(name), Some(Some(field::Value::Flag)) )) } /// Extract an `i32` field from a record. -pub fn get_i32(record: &noodles_vcf::Record, name: &str) -> Result { - if let Some(Some(field::Value::Integer(v))) = record.info().get(&field::Key::from_str(name)?) { +pub fn get_i32(record: &noodles_vcf::variant::RecordBuf, name: &str) -> Result { + if let Some(Some(field::Value::Integer(v))) = record.info().get(name) { Ok(*v) } else if let Some(Some(field::Value::Array(field::value::Array::Integer(vs)))) = - record.info().get(&field::Key::from_str(name)?) + record.info().get(name) { Ok(vs.first().unwrap().unwrap()) } else { @@ -39,11 +45,11 @@ pub fn get_i32(record: &noodles_vcf::Record, name: &str) -> Result Result { - if let Some(Some(field::Value::Float(v))) = record.info().get(&field::Key::from_str(name)?) { +pub fn get_f32(record: &noodles_vcf::variant::RecordBuf, name: &str) -> Result { + if let Some(Some(field::Value::Float(v))) = record.info().get(name) { Ok(*v) } else if let Some(Some(field::Value::Array(field::value::Array::Float(vs)))) = - record.info().get(&field::Key::from_str(name)?) + record.info().get(name) { Ok(vs.first().unwrap().unwrap()) } else { @@ -54,9 +60,12 @@ pub fn get_f32(record: &noodles_vcf::Record, name: &str) -> Result` field from record with an array field. /// /// This is different than parsing the histograms from pipe-separated strings. -pub fn get_vec_str(record: &noodles_vcf::Record, name: &str) -> Result, anyhow::Error> { +pub fn get_vec_str( + record: &noodles_vcf::variant::RecordBuf, + name: &str, +) -> Result, anyhow::Error> { if let Some(Some(field::Value::Array(field::value::Array::String(vs)))) = - record.info().get(&field::Key::from_str(name)?) + record.info().get(name) { Ok(vs.iter().flatten().cloned().collect()) } else { @@ -67,9 +76,12 @@ pub fn get_vec_str(record: &noodles_vcf::Record, name: &str) -> Result` field from record with an array field. /// /// This is different than parsing the histograms from pipe-separated strings. -pub fn get_vec_i32(record: &noodles_vcf::Record, name: &str) -> Result, anyhow::Error> { +pub fn get_vec_i32( + record: &noodles_vcf::variant::RecordBuf, + name: &str, +) -> Result, anyhow::Error> { if let Some(Some(field::Value::Array(field::value::Array::Integer(vs)))) = - record.info().get(&field::Key::from_str(name)?) + record.info().get(name) { Ok(vs.iter().flatten().cloned().collect()) } else { @@ -78,11 +90,14 @@ pub fn get_vec_i32(record: &noodles_vcf::Record, name: &str) -> Result, } /// Extract an `Vec` field from a record encoded as a pipe symbol separated string. -pub fn get_vec(record: &noodles_vcf::Record, name: &str) -> Result, anyhow::Error> +pub fn get_vec( + record: &noodles_vcf::variant::RecordBuf, + name: &str, +) -> Result, anyhow::Error> where T: FromStr, { - if let Some(Some(field::Value::String(v))) = record.info().get(&field::Key::from_str(name)?) { + if let Some(Some(field::Value::String(v))) = record.info().get(name) { v.split('|') .map(|s| s.parse()) .collect::, _>>() @@ -94,12 +109,15 @@ where /// Extract an `Vec>` field from a record encoded as a list of pipe symbol /// separated string. -pub fn get_vec_vec(record: &noodles_vcf::Record, name: &str) -> Result, anyhow::Error> +pub fn get_vec_vec( + record: &noodles_vcf::variant::RecordBuf, + name: &str, +) -> Result, anyhow::Error> where T: FromStr, { if let Some(Some(field::Value::Array(field::value::Array::String(value)))) = - record.info().get(&field::Key::from_str(name)?) + record.info().get(name) { Ok(value .iter() diff --git a/src/dbsnp/cli/import.rs b/src/dbsnp/cli/import.rs index 10665c77..5277d5ff 100644 --- a/src/dbsnp/cli/import.rs +++ b/src/dbsnp/cli/import.rs @@ -8,6 +8,7 @@ use clap::Parser; use indicatif::ParallelProgressIterator; use noodles_csi::BinningIndex as _; use noodles_vcf::header::record; +use noodles_vcf::variant::RecordBuf; use prost::Message; use rayon::prelude::{IntoParallelRefIterator, ParallelIterator}; @@ -105,7 +106,7 @@ fn process_window( let cf_dbsnp = db.cf_handle(&args.cf_name).unwrap(); let cf_dbsnp_by_rsid = db.cf_handle(&args.cf_name_by_rsid).unwrap(); let mut reader = - noodles_vcf::indexed_reader::Builder::default().build_from_path(&args.path_in_vcf)?; + noodles_vcf::io::indexed_reader::Builder::default().build_from_path(&args.path_in_vcf)?; let header = reader.read_header()?; let raw_region = format!("{}:{}-{}", chrom, begin + 1, end); @@ -130,10 +131,10 @@ fn process_window( // exist). if let Some(query) = query { for result in query { - let vcf_record = result?; + let vcf_record = RecordBuf::try_from_variant_record(&header, &result?)?; // Process each alternate allele into one record. - for allele_no in 0..vcf_record.alternate_bases().len() { + for allele_no in 0..vcf_record.alternate_bases().as_ref().len() { let key_buf: Vec = common::keys::Var::from_vcf_allele(&vcf_record, allele_no).into(); let record = dbsnp::pbs::Record::from_vcf_allele(&vcf_record, allele_no)?; @@ -158,7 +159,7 @@ pub fn run(common: &common::cli::Args, args: &Args) -> Result<(), anyhow::Error> tracing::info!("Opening dbSNP VCF file..."); let before_loading = std::time::Instant::now(); let mut reader_vcf = - noodles_vcf::indexed_reader::Builder::default().build_from_path(&args.path_in_vcf)?; + noodles_vcf::io::indexed_reader::Builder::default().build_from_path(&args.path_in_vcf)?; let header = reader_vcf.read_header()?; let dbsnp_reference = if let record::value::Collection::Unstructured(values) = header .other_records() diff --git a/src/dbsnp/pbs.rs b/src/dbsnp/pbs.rs index 8ffb4341..d4a5e524 100644 --- a/src/dbsnp/pbs.rs +++ b/src/dbsnp/pbs.rs @@ -1,28 +1,30 @@ //! Data structures for (de-)serialization as generated by `prost-build`. -use std::str::FromStr; +use noodles_vcf::variant::record::AlternateBases; pub use crate::pbs::dbsnp::Record; -use noodles_vcf::record::info::field; +use noodles_vcf::variant::record_buf::info::field; impl Record { /// Creates a new `Record` from a VCF record and allele number. pub fn from_vcf_allele( - record: &noodles_vcf::record::Record, + record: &noodles_vcf::variant::RecordBuf, allele_no: usize, ) -> Result { - let chrom = record.chromosome().to_string(); - let pos: usize = record.position().into(); - let pos: i32 = pos.try_into()?; + let chrom = record.reference_sequence_name().to_string(); + let pos: usize = record + .variant_start() + .expect("Telomeric breakends not supported") + .get(); + let pos: i32 = i32::try_from(pos)?; let ref_allele = record.reference_bases().to_string(); let alt_allele = record .alternate_bases() - .get(allele_no) - .ok_or_else(|| anyhow::anyhow!("no such allele: {}", allele_no))? + .iter() + .nth(allele_no) + .ok_or_else(|| anyhow::anyhow!("no such allele: {}", allele_no))?? .to_string(); - let rs_id = if let Some(Some(field::Value::Integer(rs))) = - record.info().get(&field::Key::from_str("RS")?) - { + let rs_id = if let Some(Some(field::Value::Integer(rs))) = record.info().get("RS") { *rs } else { anyhow::bail!("no rs id in dbSNP record") diff --git a/src/freqs/cli/import/auto.rs b/src/freqs/cli/import/auto.rs index 3000366f..d7e854f1 100644 --- a/src/freqs/cli/import/auto.rs +++ b/src/freqs/cli/import/auto.rs @@ -7,8 +7,8 @@ fn write_record( db: &rocksdb::DBWithThreadMode, cf: &std::sync::Arc, record_key: &common::keys::Var, - record_genome: &mut Option, - record_exome: &mut Option, + record_genome: &mut Option, + record_exome: &mut Option, ) -> Result<(), anyhow::Error> { if record_genome.is_none() && record_exome.is_none() { // Early exit, nothing to write out. @@ -56,11 +56,14 @@ pub fn import_region( let mut readers = Vec::new(); if let Some(path_genome) = path_genome { is_genome.push(true); - readers.push(noodles_vcf::indexed_reader::Builder::default().build_from_path(path_genome)?); + readers.push( + noodles_vcf::io::indexed_reader::Builder::default().build_from_path(path_genome)?, + ); } if let Some(path_exome) = path_exome { is_genome.push(false); - readers.push(noodles_vcf::indexed_reader::Builder::default().build_from_path(path_exome)?); + readers + .push(noodles_vcf::io::indexed_reader::Builder::default().build_from_path(path_exome)?); } // Read headers. let headers: Vec<_> = readers @@ -75,7 +78,7 @@ pub fn import_region( .map(|(reader, header)| reader.query(header, region)) .collect::>()?; // Construct the `MultiQuery`. - let multi_query = super::reading::MultiQuery::new(queries)?; + let multi_query = super::reading::MultiQuery::new(queries, &headers)?; // Now iterate over the `MultiQuery` and write to the database. // diff --git a/src/freqs/cli/import/mod.rs b/src/freqs/cli/import/mod.rs index 602ca69d..a9d74b1b 100644 --- a/src/freqs/cli/import/mod.rs +++ b/src/freqs/cli/import/mod.rs @@ -76,21 +76,24 @@ fn assign_to_chrom( let mut res = HashMap::new(); for path in paths { - let mut reader = noodles_vcf::indexed_reader::Builder::default().build_from_path(path)?; + let mut reader = + noodles_vcf::io::indexed_reader::Builder::default().build_from_path(path)?; let header = Box::new(reader.read_header()?); freqs::cli::import::reading::guess_assembly(header.as_ref(), true, Some(assembly))?; let record = reader - .records(header.as_ref()) + .record_bufs(header.as_ref()) .next() .transpose()? .ok_or(anyhow::anyhow!("No records in VCF file {}", path))?; - let k = contig_map.chrom_to_idx(record.chromosome()).map_err(|e| { - anyhow::anyhow!( - "Error mapping chromosome {} to index: {}", - record.chromosome(), - e - ) - })?; + let k = contig_map + .chrom_to_idx(record.reference_sequence_name()) + .map_err(|e| { + anyhow::anyhow!( + "Error mapping chromosome {} to index: {}", + record.reference_sequence_name(), + e + ) + })?; let v = path.clone(); res.insert(k, v); } diff --git a/src/freqs/cli/import/mt.rs b/src/freqs/cli/import/mt.rs index 32ef54ca..ec8028b6 100644 --- a/src/freqs/cli/import/mt.rs +++ b/src/freqs/cli/import/mt.rs @@ -7,8 +7,8 @@ fn write_record( db: &rocksdb::DBWithThreadMode, cf: &std::sync::Arc, record_key: &common::keys::Var, - record_gnomad: &mut Option, - record_helix: &mut Option, + record_gnomad: &mut Option, + record_helix: &mut Option, ) -> Result<(), anyhow::Error> { if record_gnomad.is_none() && record_helix.is_none() { // Early exit, nothing to write out. @@ -59,12 +59,15 @@ pub fn import_region( if let Some(path_gnomad) = path_gnomad { is_gnomad.push(true); paths.push(path_gnomad); - readers.push(noodles_vcf::indexed_reader::Builder::default().build_from_path(path_gnomad)?); + readers.push( + noodles_vcf::io::indexed_reader::Builder::default().build_from_path(path_gnomad)?, + ); } if let Some(path_helix) = path_helix { is_gnomad.push(false); paths.push(path_helix); - readers.push(noodles_vcf::indexed_reader::Builder::default().build_from_path(path_helix)?); + readers + .push(noodles_vcf::io::indexed_reader::Builder::default().build_from_path(path_helix)?); } // Read headers. let headers: Vec<_> = readers @@ -92,7 +95,7 @@ pub fn import_region( }) .collect::>()?; // Construct the `MultiQuery`. - let multi_query = super::reading::MultiQuery::new(queries)?; + let multi_query = super::reading::MultiQuery::new(queries, &headers)?; // Now iterate over the `MultiQuery` and write to the database. // diff --git a/src/freqs/cli/import/reading.rs b/src/freqs/cli/import/reading.rs index 6b482633..b1ba1aed 100644 --- a/src/freqs/cli/import/reading.rs +++ b/src/freqs/cli/import/reading.rs @@ -3,6 +3,9 @@ use std::collections::{BTreeMap, HashMap}; use biocommons_bioutils::assemblies::{Assembly, Sequence, ASSEMBLY_INFOS}; +use noodles_vcf::variant::record::AlternateBases; +use noodles_vcf::variant::RecordBuf; +use noodles_vcf::Header; use crate::common::cli::CANONICAL; @@ -42,14 +45,8 @@ impl ContigMap { } /// Map chromosome to index. - pub fn chrom_to_idx( - &self, - chrom: &noodles_vcf::record::Chromosome, - ) -> Result { - match chrom { - noodles_vcf::record::Chromosome::Name(s) - | noodles_vcf::record::Chromosome::Symbol(s) => self.chrom_name_to_idx(s), - } + pub fn chrom_to_idx(&self, chrom: &str) -> Result { + self.chrom_name_to_idx(chrom) } /// Map chromosome name to index. @@ -72,7 +69,7 @@ struct Key { /// Chromosome. chrom: String, /// Noodles position. - pos: noodles_vcf::record::Position, + pos: noodles_core::Position, /// Reference allele. reference: String, /// First (and only) alternate allelele. @@ -82,14 +79,18 @@ struct Key { } /// Build a key from a VCF record. -fn build_key(record: &noodles_vcf::Record, i: usize) -> Key { +fn build_key(record: &RecordBuf, i: usize) -> Key { Key { - chrom: record.chromosome().to_string(), - pos: record.position(), + chrom: record.reference_sequence_name().to_string(), + pos: record + .variant_start() + .expect("Telomeric breakends not supported"), reference: record.reference_bases().to_string(), alternative: record .alternate_bases() - .first() + .iter() + .next() + .expect("must have alternate allele") .expect("must have alternate allele") .to_string(), idx: i, @@ -99,27 +100,33 @@ fn build_key(record: &noodles_vcf::Record, i: usize) -> Key { /// Read through multiple `noodles_vcf::vcf::reader::Query`s at once. pub struct MultiQuery<'r, 'h, R> where - R: std::io::Read + std::io::Seek, + R: std::io::Read + noodles_bgzf::io::Seek, { /// One query for each input file. - queries: Vec>, + queries: Vec>, + + /// One header for each input file. (Not accessible from Query) + headers: Vec
, + /// The current smallest-by-coordinate records. - records: BTreeMap, + records: BTreeMap, } impl<'r, 'h, R> MultiQuery<'r, 'h, R> where - R: std::io::Read + std::io::Seek, + R: noodles_bgzf::io::BufRead + noodles_bgzf::io::Seek, { /// Construct a new `MultiQuery`. pub fn new( - mut record_iters: Vec>, + mut record_iters: Vec>, + headers: &[Header], ) -> std::io::Result { let mut records = BTreeMap::new(); - for (i, iter) in record_iters.iter_mut().enumerate() { + for (i, (iter, header)) in record_iters.iter_mut().zip(headers).enumerate() { if let Some(result) = iter.next() { let record = result?; + let record = RecordBuf::try_from_variant_record(header, &record)?; let key = build_key(&record, i); records.insert(key, record); } @@ -127,6 +134,7 @@ where Ok(Self { queries: record_iters, + headers: headers.to_vec(), records, }) } @@ -134,9 +142,9 @@ where impl<'r, 'h, R> Iterator for MultiQuery<'r, 'h, R> where - R: std::io::Read + std::io::Seek, + R: noodles_bgzf::io::BufRead + noodles_bgzf::io::Seek, { - type Item = std::io::Result<(usize, noodles_vcf::Record)>; + type Item = std::io::Result<(usize, RecordBuf)>; /// Return next item if any. fn next(&mut self) -> Option { @@ -145,6 +153,8 @@ where if let Some(result) = self.queries[idx].next() { match result { Ok(record) => { + let record = + RecordBuf::try_from_variant_record(&self.headers[idx], &record).ok()?; let key = build_key(&record, idx); self.records.insert(key, record); } @@ -193,7 +203,7 @@ pub fn guess_assembly( let mut compatible = 0; for (name, data) in vcf_header.contigs() { if let Some(length) = data.length() { - let idx = contig_map.name_map.get(name.as_ref()); + let idx = contig_map.name_map.get(name); if let Some(idx) = idx { let name = &info.sequences[*idx].name; if CANONICAL.contains(&name.as_ref()) { @@ -260,7 +270,7 @@ mod test { #[test] fn guess_assembly_helix_chrmt_ambiguous_ok_initial_none() -> Result<(), anyhow::Error> { let path = "tests/freqs/grch37/v2.1/reading/helix.chrM.vcf"; - let mut reader = noodles_vcf::reader::Builder::default().build_from_path(path)?; + let mut reader = noodles_vcf::io::reader::Builder::default().build_from_path(path)?; let header = reader.read_header()?; let actual = guess_assembly(&header, true, None)?; @@ -272,7 +282,7 @@ mod test { #[test] fn guess_assembly_helix_chrmt_ambiguous_ok_initial_override() -> Result<(), anyhow::Error> { let path = "tests/freqs/grch37/v2.1/reading/helix.chrM.vcf"; - let mut reader = noodles_vcf::reader::Builder::default().build_from_path(path)?; + let mut reader = noodles_vcf::io::reader::Builder::default().build_from_path(path)?; let header = reader.read_header()?; let actual = guess_assembly(&header, true, Some(Assembly::Grch37p10))?; @@ -285,7 +295,7 @@ mod test { fn guess_assembly_helix_chrmt_ambiguous_ok_initial_override_fails() -> Result<(), anyhow::Error> { let path = "tests/freqs/grch37/v2.1/reading/helix.chrM.vcf"; - let mut reader = noodles_vcf::reader::Builder::default().build_from_path(path)?; + let mut reader = noodles_vcf::io::reader::Builder::default().build_from_path(path)?; let header = reader.read_header()?; assert!(guess_assembly(&header, false, Some(Assembly::Grch37)).is_err()); @@ -296,7 +306,7 @@ mod test { #[test] fn guess_assembly_helix_chrmt_ambiguous_fail() -> Result<(), anyhow::Error> { let path = "tests/freqs/grch37/v2.1/reading/helix.chrM.vcf"; - let mut reader = noodles_vcf::reader::Builder::default().build_from_path(path)?; + let mut reader = noodles_vcf::io::reader::Builder::default().build_from_path(path)?; let header = reader.read_header()?; assert!(guess_assembly(&header, false, None).is_err()); @@ -313,9 +323,9 @@ mod test { #[test] fn test_multiquery() -> Result<(), anyhow::Error> { let mut readers = vec![ - noodles_vcf::indexed_reader::Builder::default() + noodles_vcf::io::indexed_reader::Builder::default() .build_from_path("tests/freqs/grch37/v2.1/reading/gnomad.chrM.vcf.bgz")?, - noodles_vcf::indexed_reader::Builder::default() + noodles_vcf::io::indexed_reader::Builder::default() .build_from_path("tests/freqs/grch37/v2.1/reading/helix.chrM.vcf.bgz")?, ]; @@ -334,7 +344,7 @@ mod test { .map(|(reader, header)| reader.query(header, ®ion)) .collect::>()?; - let multi_query = MultiQuery::new(queries)?; + let multi_query = MultiQuery::new(queries, &headers)?; let mut records = Vec::new(); for result in multi_query { diff --git a/src/freqs/cli/import/snapshots/annonars__freqs__cli__import__reading__test__multiquery.snap b/src/freqs/cli/import/snapshots/annonars__freqs__cli__import__reading__test__multiquery.snap index aa73c88b..31f6a1b0 100644 --- a/src/freqs/cli/import/snapshots/annonars__freqs__cli__import__reading__test__multiquery.snap +++ b/src/freqs/cli/import/snapshots/annonars__freqs__cli__import__reading__test__multiquery.snap @@ -1,54 +1,41 @@ --- source: src/freqs/cli/import/reading.rs +assertion_line: 354 expression: records --- [ ( 0, - Record { - chromosome: Name( - "chrM", - ), - position: Position( - 3, + RecordBuf { + reference_sequence_name: "chrM", + variant_start: Some( + Position( + 3, + ), ), ids: Ids( {}, ), - reference_bases: ReferenceBases( - [ - T, - ], - ), + reference_bases: "T", alternate_bases: AlternateBases( [ - Bases( - [ - C, - ], - ), + "C", ], ), quality_score: None, - filters: Some( - Pass, + filters: Filters( + { + "PASS", + }, ), info: Info( { - Other( - Other( - "variant_collapsed", - ), - ): Some( + "variant_collapsed": Some( String( "T3C", ), ), - Other( - Other( - "vep", - ), - ): Some( + "vep": Some( Array( String( [ @@ -59,207 +46,117 @@ expression: records ), ), ), - Other( - Other( - "base_qual_hist", - ), - ): Some( + "base_qual_hist": Some( String( "0|0|0|0|0|0|0|0|0|0", ), ), - Other( - Other( - "position_hist", - ), - ): Some( + "position_hist": Some( String( "0|0|0|0|0|0|0|0|0|0", ), ), - Other( - Other( - "strand_bias_hist", - ), - ): Some( + "strand_bias_hist": Some( String( "0|0|0|0|0|0|0|0|0|0", ), ), - Other( - Other( - "weak_evidence_hist", - ), - ): Some( + "weak_evidence_hist": Some( String( "0|0|0|0|0|0|0|0|0|0", ), ), - Other( - Other( - "contamination_hist", - ), - ): Some( + "contamination_hist": Some( String( "0|0|0|0|0|0|0|0|0|0", ), ), - Other( - Other( - "heteroplasmy_below_min_het_threshold_hist", - ), - ): Some( + "heteroplasmy_below_min_het_threshold_hist": Some( String( "0|0|0|0|0|0|0|0|0|0", ), ), - Other( - Other( - "excluded_AC", - ), - ): Some( + "excluded_AC": Some( Integer( 0, ), ), - Standard( - TotalAlleleCount, - ): Some( + "AN": Some( Integer( 56434, ), ), - Other( - Other( - "AC_hom", - ), - ): Some( + "AC_hom": Some( Integer( 19, ), ), - Other( - Other( - "AC_het", - ), - ): Some( + "AC_het": Some( Integer( 1, ), ), - Other( - Other( - "hl_hist", - ), - ): Some( + "hl_hist": Some( String( "0|0|0|0|0|0|0|0|1|19", ), ), - Other( - Other( - "dp_mean", - ), - ): Some( + "dp_mean": Some( Float( 2522.87, ), ), - Other( - Other( - "mq_mean", - ), - ): Some( + "mq_mean": Some( Float( 60.0, ), ), - Other( - Other( - "tlod_mean", - ), - ): Some( + "tlod_mean": Some( Float( 6805.54, ), ), - Other( - Other( - "AF_hom", - ), - ): Some( + "AF_hom": Some( Float( 0.000336676, ), ), - Other( - Other( - "AF_het", - ), - ): Some( + "AF_het": Some( Float( 1.77198e-5, ), ), - Other( - Other( - "max_hl", - ), - ): Some( + "max_hl": Some( Float( 0.997, ), ), - Other( - Other( - "hap_AN", - ), - ): Some( + "hap_AN": Some( String( "2680|1537|868|603|34|282|91|14784|701|934|3144|2732|663|2977|4724|5672|126|1|1298|366|7|393|3080|6037|1234|819|546|12|89", ), ), - Other( - Other( - "hap_AC_het", - ), - ): Some( + "hap_AC_het": Some( String( "0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|1|0|0|0|0|0|0", ), ), - Other( - Other( - "hap_AC_hom", - ), - ): Some( + "hap_AC_hom": Some( String( "0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0|0|0|0|0|0|0|17|1|0|0|0|0|0", ), ), - Other( - Other( - "hap_AF_hom", - ), - ): Some( + "hap_AF_hom": Some( String( "0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|6.76407e-05|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|5.51948e-03|1.65645e-04|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00", ), ), - Other( - Other( - "hap_AF_het", - ), - ): Some( + "hap_AF_het": Some( String( "0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|3.24675e-04|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00", ), ), - Other( - Other( - "hap_hl_hist", - ), - ): Some( + "hap_hl_hist": Some( Array( String( [ @@ -354,92 +251,52 @@ expression: records ), ), ), - Other( - Other( - "hap_faf_hom", - ), - ): Some( + "hap_faf_hom": Some( String( "0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|3.51633e-03|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00", ), ), - Other( - Other( - "hapmax_AF_hom", - ), - ): Some( + "hapmax_AF_hom": Some( String( "T", ), ), - Other( - Other( - "hapmax_AF_het", - ), - ): Some( + "hapmax_AF_het": Some( String( "T", ), ), - Other( - Other( - "faf_hapmax_hom", - ), - ): Some( + "faf_hapmax_hom": Some( Float( 0.00351633, ), ), - Other( - Other( - "pop_AN", - ), - ): Some( + "pop_AN": Some( String( "14347|392|5718|1415|1482|4892|25849|826|1493|20", ), ), - Other( - Other( - "pop_AC_het", - ), - ): Some( + "pop_AC_het": Some( String( "0|0|0|0|0|0|1|0|0|0", ), ), - Other( - Other( - "pop_AC_hom", - ), - ): Some( + "pop_AC_hom": Some( String( "0|0|0|0|0|0|19|0|0|0", ), ), - Other( - Other( - "pop_AF_hom", - ), - ): Some( + "pop_AF_hom": Some( String( "0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|7.35038e-04|0.00000e+00|0.00000e+00|0.00000e+00", ), ), - Other( - Other( - "pop_AF_het", - ), - ): Some( + "pop_AF_het": Some( String( "0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|3.86862e-05|0.00000e+00|0.00000e+00|0.00000e+00", ), ), - Other( - Other( - "pop_hl_hist", - ), - ): Some( + "pop_hl_hist": Some( Array( String( [ @@ -477,99 +334,59 @@ expression: records ), ), ), - Other( - Other( - "age_hist_hom_bin_freq", - ), - ): Some( + "age_hist_hom_bin_freq": Some( String( "0|1|0|2|1|0|0|2|0|0", ), ), - Other( - Other( - "age_hist_hom_n_smaller", - ), - ): Some( + "age_hist_hom_n_smaller": Some( Integer( 3, ), ), - Other( - Other( - "age_hist_hom_n_larger", - ), - ): Some( + "age_hist_hom_n_larger": Some( Integer( 0, ), ), - Other( - Other( - "age_hist_het_bin_freq", - ), - ): Some( + "age_hist_het_bin_freq": Some( String( "0|0|0|0|0|0|0|0|0|0", ), ), - Other( - Other( - "age_hist_het_n_smaller", - ), - ): Some( + "age_hist_het_n_smaller": Some( Integer( 0, ), ), - Other( - Other( - "age_hist_het_n_larger", - ), - ): Some( + "age_hist_het_n_larger": Some( Integer( 0, ), ), - Other( - Other( - "dp_hist_all_n_larger", - ), - ): Some( + "dp_hist_all_n_larger": Some( Integer( 35782, ), ), - Other( - Other( - "dp_hist_alt_n_larger", - ), - ): Some( + "dp_hist_alt_n_larger": Some( Integer( 11, ), ), - Other( - Other( - "dp_hist_all_bin_freq", - ), - ): Some( + "dp_hist_all_bin_freq": Some( String( "0|1|217|1234|2334|2600|2967|3324|3791|4184", ), ), - Other( - Other( - "dp_hist_alt_bin_freq", - ), - ): Some( + "dp_hist_alt_bin_freq": Some( String( "0|0|0|0|2|0|1|4|2|0", ), ), }, ), - genotypes: Genotypes { + samples: Samples { keys: Keys( {}, ), @@ -579,64 +396,48 @@ expression: records ), ( 1, - Record { - chromosome: Name( - "chrM", - ), - position: Position( - 5, + RecordBuf { + reference_sequence_name: "chrM", + variant_start: Some( + Position( + 5, + ), ), ids: Ids( {}, ), - reference_bases: ReferenceBases( - [ - A, - ], - ), + reference_bases: "A", alternate_bases: AlternateBases( [ - Bases( - [ - C, - ], - ), + "C", ], ), quality_score: None, - filters: Some( - Pass, + filters: Filters( + { + "PASS", + }, ), info: Info( { - Standard( - TotalAlleleCount, - ): Some( + "AN": Some( Integer( 196554, ), ), - Other( - Other( - "AC_hom", - ), - ): Some( + "AC_hom": Some( Integer( 1, ), ), - Other( - Other( - "AC_het", - ), - ): Some( + "AC_het": Some( Integer( 0, ), ), }, ), - genotypes: Genotypes { + samples: Samples { keys: Keys( {}, ), @@ -646,50 +447,31 @@ expression: records ), ( 0, - Record { - chromosome: Name( - "chrM", - ), - position: Position( - 6, + RecordBuf { + reference_sequence_name: "chrM", + variant_start: Some( + Position( + 6, + ), ), ids: Ids( {}, ), - reference_bases: ReferenceBases( - [ - C, - ], - ), + reference_bases: "C", alternate_bases: AlternateBases( [ - Bases( - [ - C, - C, - T, - C, - A, - A, - ], - ), + "CCTCAA", ], ), quality_score: None, - filters: Some( - Fail( - { - "npg", - }, - ), + filters: Filters( + { + "npg", + }, ), info: Info( { - Other( - Other( - "filters", - ), - ): Some( + "filters": Some( Array( String( [ @@ -700,20 +482,12 @@ expression: records ), ), ), - Other( - Other( - "variant_collapsed", - ), - ): Some( + "variant_collapsed": Some( String( "C6CCTCAA", ), ), - Other( - Other( - "vep", - ), - ): Some( + "vep": Some( Array( String( [ @@ -724,207 +498,117 @@ expression: records ), ), ), - Other( - Other( - "base_qual_hist", - ), - ): Some( + "base_qual_hist": Some( String( "0|0|0|0|0|0|0|0|0|0", ), ), - Other( - Other( - "position_hist", - ), - ): Some( + "position_hist": Some( String( "0|0|0|0|0|0|0|0|0|0", ), ), - Other( - Other( - "strand_bias_hist", - ), - ): Some( + "strand_bias_hist": Some( String( "0|0|0|0|0|0|0|0|0|0", ), ), - Other( - Other( - "weak_evidence_hist", - ), - ): Some( + "weak_evidence_hist": Some( String( "0|0|0|0|0|0|0|0|0|0", ), ), - Other( - Other( - "contamination_hist", - ), - ): Some( + "contamination_hist": Some( String( "0|0|0|0|0|0|0|0|0|0", ), ), - Other( - Other( - "heteroplasmy_below_min_het_threshold_hist", - ), - ): Some( + "heteroplasmy_below_min_het_threshold_hist": Some( String( "1|0|0|0|0|0|0|0|0|0", ), ), - Other( - Other( - "excluded_AC", - ), - ): Some( + "excluded_AC": Some( Integer( 1, ), ), - Standard( - TotalAlleleCount, - ): Some( + "AN": Some( Integer( 56433, ), ), - Other( - Other( - "AC_hom", - ), - ): Some( + "AC_hom": Some( Integer( 0, ), ), - Other( - Other( - "AC_het", - ), - ): Some( + "AC_het": Some( Integer( 0, ), ), - Other( - Other( - "hl_hist", - ), - ): Some( + "hl_hist": Some( String( "0|0|0|0|0|0|0|0|0|0", ), ), - Other( - Other( - "dp_mean", - ), - ): Some( + "dp_mean": Some( Float( 2527.78, ), ), - Other( - Other( - "mq_mean", - ), - ): Some( + "mq_mean": Some( Float( 60.0, ), ), - Other( - Other( - "tlod_mean", - ), - ): Some( + "tlod_mean": Some( Float( 0.537, ), ), - Other( - Other( - "AF_hom", - ), - ): Some( + "AF_hom": Some( Float( 0.0, ), ), - Other( - Other( - "AF_het", - ), - ): Some( + "AF_het": Some( Float( 0.0, ), ), - Other( - Other( - "max_hl", - ), - ): Some( + "max_hl": Some( Float( 0.0, ), ), - Other( - Other( - "hap_AN", - ), - ): Some( + "hap_AN": Some( String( "2679|1537|868|603|34|282|91|14784|701|934|3144|2732|663|2977|4724|5672|126|1|1298|366|7|393|3080|6037|1234|819|546|12|89", ), ), - Other( - Other( - "hap_AC_het", - ), - ): Some( + "hap_AC_het": Some( String( "0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0", ), ), - Other( - Other( - "hap_AC_hom", - ), - ): Some( + "hap_AC_hom": Some( String( "0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0", ), ), - Other( - Other( - "hap_AF_hom", - ), - ): Some( + "hap_AF_hom": Some( String( "0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00", ), ), - Other( - Other( - "hap_AF_het", - ), - ): Some( + "hap_AF_het": Some( String( "0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00", ), ), - Other( - Other( - "hap_hl_hist", - ), - ): Some( + "hap_hl_hist": Some( Array( String( [ @@ -1019,74 +703,42 @@ expression: records ), ), ), - Other( - Other( - "hap_faf_hom", - ), - ): Some( + "hap_faf_hom": Some( String( "0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00", ), ), - Other( - Other( - "faf_hapmax_hom", - ), - ): Some( + "faf_hapmax_hom": Some( Float( 0.0, ), ), - Other( - Other( - "pop_AN", - ), - ): Some( + "pop_AN": Some( String( "14347|392|5717|1415|1482|4892|25849|826|1493|20", ), ), - Other( - Other( - "pop_AC_het", - ), - ): Some( + "pop_AC_het": Some( String( "0|0|0|0|0|0|0|0|0|0", ), ), - Other( - Other( - "pop_AC_hom", - ), - ): Some( + "pop_AC_hom": Some( String( "0|0|0|0|0|0|0|0|0|0", ), ), - Other( - Other( - "pop_AF_hom", - ), - ): Some( + "pop_AF_hom": Some( String( "0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00", ), ), - Other( - Other( - "pop_AF_het", - ), - ): Some( + "pop_AF_het": Some( String( "0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00", ), ), - Other( - Other( - "pop_hl_hist", - ), - ): Some( + "pop_hl_hist": Some( Array( String( [ @@ -1124,99 +776,59 @@ expression: records ), ), ), - Other( - Other( - "age_hist_hom_bin_freq", - ), - ): Some( + "age_hist_hom_bin_freq": Some( String( "0|0|0|0|0|0|0|0|0|0", ), ), - Other( - Other( - "age_hist_hom_n_smaller", - ), - ): Some( + "age_hist_hom_n_smaller": Some( Integer( 0, ), ), - Other( - Other( - "age_hist_hom_n_larger", - ), - ): Some( + "age_hist_hom_n_larger": Some( Integer( 0, ), ), - Other( - Other( - "age_hist_het_bin_freq", - ), - ): Some( + "age_hist_het_bin_freq": Some( String( "0|0|0|0|0|0|0|0|0|0", ), ), - Other( - Other( - "age_hist_het_n_smaller", - ), - ): Some( + "age_hist_het_n_smaller": Some( Integer( 0, ), ), - Other( - Other( - "age_hist_het_n_larger", - ), - ): Some( + "age_hist_het_n_larger": Some( Integer( 0, ), ), - Other( - Other( - "dp_hist_all_n_larger", - ), - ): Some( + "dp_hist_all_n_larger": Some( Integer( 35855, ), ), - Other( - Other( - "dp_hist_alt_n_larger", - ), - ): Some( + "dp_hist_alt_n_larger": Some( Integer( 0, ), ), - Other( - Other( - "dp_hist_all_bin_freq", - ), - ): Some( + "dp_hist_all_bin_freq": Some( String( "0|0|216|1236|2310|2568|2964|3302|3802|4180", ), ), - Other( - Other( - "dp_hist_alt_bin_freq", - ), - ): Some( + "dp_hist_alt_bin_freq": Some( String( "0|0|0|0|0|0|0|0|0|0", ), ), }, ), - genotypes: Genotypes { + samples: Samples { keys: Keys( {}, ), @@ -1226,45 +838,31 @@ expression: records ), ( 0, - Record { - chromosome: Name( - "chrM", - ), - position: Position( - 10, + RecordBuf { + reference_sequence_name: "chrM", + variant_start: Some( + Position( + 10, + ), ), ids: Ids( {}, ), - reference_bases: ReferenceBases( - [ - T, - ], - ), + reference_bases: "T", alternate_bases: AlternateBases( [ - Bases( - [ - C, - ], - ), + "C", ], ), quality_score: None, - filters: Some( - Fail( - { - "npg", - }, - ), + filters: Filters( + { + "npg", + }, ), info: Info( { - Other( - Other( - "filters", - ), - ): Some( + "filters": Some( Array( String( [ @@ -1275,20 +873,12 @@ expression: records ), ), ), - Other( - Other( - "variant_collapsed", - ), - ): Some( + "variant_collapsed": Some( String( "A7G", ), ), - Other( - Other( - "vep", - ), - ): Some( + "vep": Some( Array( String( [ @@ -1299,199 +889,109 @@ expression: records ), ), ), - Other( - Other( - "base_qual_hist", - ), - ): Some( + "base_qual_hist": Some( String( "0|0|0|0|0|0|0|0|0|0", ), ), - Other( - Other( - "position_hist", - ), - ): Some( + "position_hist": Some( String( "0|0|0|0|0|0|0|0|0|0", ), ), - Other( - Other( - "strand_bias_hist", - ), - ): Some( + "strand_bias_hist": Some( String( "0|0|0|0|0|0|0|0|0|0", ), ), - Other( - Other( - "weak_evidence_hist", - ), - ): Some( + "weak_evidence_hist": Some( String( "0|0|0|0|0|0|0|0|0|0", ), ), - Other( - Other( - "contamination_hist", - ), - ): Some( + "contamination_hist": Some( String( "0|0|0|0|0|0|0|0|0|0", ), ), - Other( - Other( - "heteroplasmy_below_min_het_threshold_hist", - ), - ): Some( + "heteroplasmy_below_min_het_threshold_hist": Some( String( "1|0|0|0|0|0|0|0|0|0", ), ), - Other( - Other( - "excluded_AC", - ), - ): Some( + "excluded_AC": Some( Integer( 1, ), ), - Standard( - TotalAlleleCount, - ): Some( + "AN": Some( Integer( 56433, ), ), - Other( - Other( - "AC_hom", - ), - ): Some( + "AC_hom": Some( Integer( 0, ), ), - Other( - Other( - "AC_het", - ), - ): Some( + "AC_het": Some( Integer( 0, ), ), - Other( - Other( - "hl_hist", - ), - ): Some( + "hl_hist": Some( String( "0|0|0|0|0|0|0|0|0|0", ), ), - Other( - Other( - "dp_mean", - ), - ): Some( + "dp_mean": Some( Float( 2555.14, ), ), - Other( - Other( - "mq_mean", - ), - ): None, - Other( - Other( - "tlod_mean", - ), - ): None, - Other( - Other( - "AF_hom", - ), - ): Some( + "mq_mean": None, + "tlod_mean": None, + "AF_hom": Some( Float( 0.0, ), ), - Other( - Other( - "AF_het", - ), - ): Some( + "AF_het": Some( Float( 0.0, ), ), - Other( - Other( - "max_hl", - ), - ): Some( + "max_hl": Some( Float( 0.0, ), ), - Other( - Other( - "hap_AN", - ), - ): Some( + "hap_AN": Some( String( "2679|1537|868|603|34|282|91|14784|701|934|3144|2732|663|2977|4724|5672|126|1|1298|366|7|393|3080|6037|1234|819|546|12|89", ), ), - Other( - Other( - "hap_AC_het", - ), - ): Some( + "hap_AC_het": Some( String( "0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0", ), ), - Other( - Other( - "hap_AC_hom", - ), - ): Some( + "hap_AC_hom": Some( String( "0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0", ), ), - Other( - Other( - "hap_AF_hom", - ), - ): Some( + "hap_AF_hom": Some( String( "0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00", ), ), - Other( - Other( - "hap_AF_het", - ), - ): Some( + "hap_AF_het": Some( String( "0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00", ), ), - Other( - Other( - "hap_hl_hist", - ), - ): Some( + "hap_hl_hist": Some( Array( String( [ @@ -1586,74 +1086,42 @@ expression: records ), ), ), - Other( - Other( - "hap_faf_hom", - ), - ): Some( + "hap_faf_hom": Some( String( "0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00", ), ), - Other( - Other( - "faf_hapmax_hom", - ), - ): Some( + "faf_hapmax_hom": Some( Float( 0.0, ), ), - Other( - Other( - "pop_AN", - ), - ): Some( + "pop_AN": Some( String( "14347|392|5717|1415|1482|4892|25849|826|1493|20", ), ), - Other( - Other( - "pop_AC_het", - ), - ): Some( + "pop_AC_het": Some( String( "0|0|0|0|0|0|0|0|0|0", ), ), - Other( - Other( - "pop_AC_hom", - ), - ): Some( + "pop_AC_hom": Some( String( "0|0|0|0|0|0|0|0|0|0", ), ), - Other( - Other( - "pop_AF_hom", - ), - ): Some( + "pop_AF_hom": Some( String( "0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00", ), ), - Other( - Other( - "pop_AF_het", - ), - ): Some( + "pop_AF_het": Some( String( "0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00|0.00000e+00", ), ), - Other( - Other( - "pop_hl_hist", - ), - ): Some( + "pop_hl_hist": Some( Array( String( [ @@ -1691,99 +1159,59 @@ expression: records ), ), ), - Other( - Other( - "age_hist_hom_bin_freq", - ), - ): Some( + "age_hist_hom_bin_freq": Some( String( "0|0|0|0|0|0|0|0|0|0", ), ), - Other( - Other( - "age_hist_hom_n_smaller", - ), - ): Some( + "age_hist_hom_n_smaller": Some( Integer( 0, ), ), - Other( - Other( - "age_hist_hom_n_larger", - ), - ): Some( + "age_hist_hom_n_larger": Some( Integer( 0, ), ), - Other( - Other( - "age_hist_het_bin_freq", - ), - ): Some( + "age_hist_het_bin_freq": Some( String( "0|0|0|0|0|0|0|0|0|0", ), ), - Other( - Other( - "age_hist_het_n_smaller", - ), - ): Some( + "age_hist_het_n_smaller": Some( Integer( 0, ), ), - Other( - Other( - "age_hist_het_n_larger", - ), - ): Some( + "age_hist_het_n_larger": Some( Integer( 0, ), ), - Other( - Other( - "dp_hist_all_n_larger", - ), - ): Some( + "dp_hist_all_n_larger": Some( Integer( 36388, ), ), - Other( - Other( - "dp_hist_alt_n_larger", - ), - ): Some( + "dp_hist_alt_n_larger": Some( Integer( 0, ), ), - Other( - Other( - "dp_hist_all_bin_freq", - ), - ): Some( + "dp_hist_all_bin_freq": Some( String( "0|0|194|1156|2241|2524|2903|3254|3706|4067", ), ), - Other( - Other( - "dp_hist_alt_bin_freq", - ), - ): Some( + "dp_hist_alt_bin_freq": Some( String( "0|0|0|0|0|0|0|0|0|0", ), ), }, ), - genotypes: Genotypes { + samples: Samples { keys: Keys( {}, ), @@ -1793,64 +1221,48 @@ expression: records ), ( 1, - Record { - chromosome: Name( - "chrM", - ), - position: Position( - 10, + RecordBuf { + reference_sequence_name: "chrM", + variant_start: Some( + Position( + 10, + ), ), ids: Ids( {}, ), - reference_bases: ReferenceBases( - [ - T, - ], - ), + reference_bases: "T", alternate_bases: AlternateBases( [ - Bases( - [ - C, - ], - ), + "C", ], ), quality_score: None, - filters: Some( - Pass, + filters: Filters( + { + "PASS", + }, ), info: Info( { - Standard( - TotalAlleleCount, - ): Some( + "AN": Some( Integer( 196554, ), ), - Other( - Other( - "AC_hom", - ), - ): Some( + "AC_hom": Some( Integer( 7, ), ), - Other( - Other( - "AC_het", - ), - ): Some( + "AC_het": Some( Integer( 1, ), ), }, ), - genotypes: Genotypes { + samples: Samples { keys: Keys( {}, ), @@ -1860,64 +1272,48 @@ expression: records ), ( 1, - Record { - chromosome: Name( - "chrM", - ), - position: Position( - 11, + RecordBuf { + reference_sequence_name: "chrM", + variant_start: Some( + Position( + 11, + ), ), ids: Ids( {}, ), - reference_bases: ReferenceBases( - [ - C, - ], - ), + reference_bases: "C", alternate_bases: AlternateBases( [ - Bases( - [ - T, - ], - ), + "T", ], ), quality_score: None, - filters: Some( - Pass, + filters: Filters( + { + "PASS", + }, ), info: Info( { - Standard( - TotalAlleleCount, - ): Some( + "AN": Some( Integer( 196554, ), ), - Other( - Other( - "AC_hom", - ), - ): Some( + "AC_hom": Some( Integer( 0, ), ), - Other( - Other( - "AC_het", - ), - ): Some( + "AC_het": Some( Integer( 1, ), ), }, ), - genotypes: Genotypes { + samples: Samples { keys: Keys( {}, ), diff --git a/src/freqs/cli/import/xy.rs b/src/freqs/cli/import/xy.rs index 04b86237..6c713382 100644 --- a/src/freqs/cli/import/xy.rs +++ b/src/freqs/cli/import/xy.rs @@ -7,8 +7,8 @@ fn write_record( db: &rocksdb::DBWithThreadMode, cf: &std::sync::Arc, record_key: &common::keys::Var, - record_genome: &mut Option, - record_exome: &mut Option, + record_genome: &mut Option, + record_exome: &mut Option, ) -> Result<(), anyhow::Error> { if record_genome.is_none() && record_exome.is_none() { // Early exit, nothing to write out. @@ -56,11 +56,14 @@ pub fn import_region( let mut readers = Vec::new(); if let Some(path_genome) = path_genome { is_genome.push(true); - readers.push(noodles_vcf::indexed_reader::Builder::default().build_from_path(path_genome)?); + readers.push( + noodles_vcf::io::indexed_reader::Builder::default().build_from_path(path_genome)?, + ); } if let Some(path_exome) = path_exome { is_genome.push(false); - readers.push(noodles_vcf::indexed_reader::Builder::default().build_from_path(path_exome)?); + readers + .push(noodles_vcf::io::indexed_reader::Builder::default().build_from_path(path_exome)?); } // Read headers. let headers: Vec<_> = readers @@ -75,7 +78,7 @@ pub fn import_region( .map(|(reader, header)| reader.query(header, region)) .collect::>()?; // Construct the `MultiQuery`. - let multi_query = super::reading::MultiQuery::new(queries)?; + let multi_query = super::reading::MultiQuery::new(queries, &headers)?; // Now iterate over the `MultiQuery` and write to the database. // diff --git a/src/freqs/serialized/auto.rs b/src/freqs/serialized/auto.rs index e403e937..c459d844 100644 --- a/src/freqs/serialized/auto.rs +++ b/src/freqs/serialized/auto.rs @@ -1,6 +1,7 @@ //! Autosomal counts. use byteorder::{ByteOrder, LittleEndian}; +use noodles_vcf::variant::record::AlternateBases; use crate::common::noodles; @@ -17,7 +18,7 @@ pub struct Counts { impl Counts { /// Create from the given VCF record. - pub fn from_vcf_allele(value: &noodles_vcf::Record, _allele_no: usize) -> Self { + pub fn from_vcf_allele(value: &noodles_vcf::variant::RecordBuf, _allele_no: usize) -> Self { tracing::trace!("@ {:?}", &value); assert_eq!( value.alternate_bases().len(), diff --git a/src/freqs/serialized/mt.rs b/src/freqs/serialized/mt.rs index e4ba61f8..de33fc88 100644 --- a/src/freqs/serialized/mt.rs +++ b/src/freqs/serialized/mt.rs @@ -1,6 +1,7 @@ //! Mitochondrial counts. use byteorder::{ByteOrder, LittleEndian}; +use noodles_vcf::variant::record::AlternateBases; use crate::common::noodles; // use noodles_vcf::{ @@ -22,7 +23,7 @@ pub struct Counts { impl Counts { /// Create from the given VCF record. - pub fn from_vcf_allele(value: &noodles_vcf::Record, _allele_no: usize) -> Self { + pub fn from_vcf_allele(value: &noodles_vcf::variant::RecordBuf, _allele_no: usize) -> Self { assert_eq!( value.alternate_bases().len(), 1, diff --git a/src/freqs/serialized/xy.rs b/src/freqs/serialized/xy.rs index dabbed82..17301073 100644 --- a/src/freqs/serialized/xy.rs +++ b/src/freqs/serialized/xy.rs @@ -1,6 +1,7 @@ //! gonosomal counts. use byteorder::{ByteOrder, LittleEndian}; +use noodles_vcf::variant::record::AlternateBases; use crate::common::noodles; @@ -19,7 +20,7 @@ pub struct Counts { impl Counts { /// Create from the given VCF record. - pub fn from_vcf_allele(value: &noodles_vcf::Record, _allele_no: usize) -> Self { + pub fn from_vcf_allele(value: &noodles_vcf::variant::RecordBuf, _allele_no: usize) -> Self { assert_eq!( value.alternate_bases().len(), 1, diff --git a/src/gnomad_mtdna/cli/import.rs b/src/gnomad_mtdna/cli/import.rs index 1bbe02f6..fb69e4f2 100644 --- a/src/gnomad_mtdna/cli/import.rs +++ b/src/gnomad_mtdna/cli/import.rs @@ -5,6 +5,8 @@ use std::sync::Arc; use clap::Parser; use indicatif::ParallelProgressIterator as _; use noodles_csi::BinningIndex as _; +use noodles_vcf::variant::record::AlternateBases; +use noodles_vcf::variant::RecordBuf; use prost::Message as _; use rayon::prelude::{IntoParallelRefIterator, ParallelIterator}; @@ -109,7 +111,7 @@ fn process_window( ) -> Result<(), anyhow::Error> { let cf_gnomad = db.cf_handle(&args.cf_name).unwrap(); let mut reader = - noodles_vcf::indexed_reader::Builder::default().build_from_path(&args.path_in_vcf)?; + noodles_vcf::io::indexed_reader::Builder::default().build_from_path(&args.path_in_vcf)?; let header = reader.read_header()?; let raw_region = format!("{}:{}-{}", chrom, begin + 1, end); @@ -136,6 +138,7 @@ fn process_window( if let Some(query) = query { for result in query { let vcf_record = result?; + let vcf_record = RecordBuf::try_from_variant_record(&header, &vcf_record)?; // Process each alternate allele into one record. let details_options = serde_json::from_str( diff --git a/src/gnomad_nuclear/cli/import.rs b/src/gnomad_nuclear/cli/import.rs index d6d54be2..2b301ed2 100644 --- a/src/gnomad_nuclear/cli/import.rs +++ b/src/gnomad_nuclear/cli/import.rs @@ -6,6 +6,8 @@ use clap::Parser; use indicatif::ParallelProgressIterator; use noodles_csi::BinningIndex as _; use noodles_vcf::header::record; +use noodles_vcf::variant::record::AlternateBases; +use noodles_vcf::variant::RecordBuf; use prost::Message; use rayon::prelude::{IntoParallelRefIterator, ParallelIterator}; @@ -174,7 +176,7 @@ fn process_window( ) -> Result<(), anyhow::Error> { let cf_gnomad = db.cf_handle(&args.cf_name).unwrap(); let mut reader = - noodles_vcf::indexed_reader::Builder::default().build_from_path(path_in_vcf)?; + noodles_vcf::io::indexed_reader::Builder::default().build_from_path(path_in_vcf)?; let header = reader.read_header()?; let raw_region = format!("{}:{}-{}", chrom, begin + 1, end); @@ -201,6 +203,7 @@ fn process_window( if let Some(query) = query { for result in query { let vcf_record = result?; + let vcf_record = RecordBuf::try_from_variant_record(&header, &vcf_record)?; // Process each alternate allele into one record. for allele_no in 0..vcf_record.alternate_bases().len() { @@ -328,7 +331,7 @@ pub fn run(common: &common::cli::Args, args: &Args) -> Result<(), anyhow::Error> tracing::info!("Opening gnomAD-nuclear VCF file..."); let before_loading = std::time::Instant::now(); let mut reader_vcf = - noodles_vcf::reader::Builder::default().build_from_path(&args.path_in_vcf[0])?; + noodles_vcf::io::reader::Builder::default().build_from_path(&args.path_in_vcf[0])?; let header = reader_vcf.read_header()?; let vep_version = if let Some(record::value::Collection::Unstructured(values)) = header diff --git a/src/gnomad_sv/cli/import/gnomad_cnv4.rs b/src/gnomad_sv/cli/import/gnomad_cnv4.rs index 8e77ab34..b2304dee 100644 --- a/src/gnomad_sv/cli/import/gnomad_cnv4.rs +++ b/src/gnomad_sv/cli/import/gnomad_cnv4.rs @@ -1,5 +1,7 @@ //! gnomAD CNV v4 import. +use itertools::Itertools; +use noodles_vcf::variant::record::Ids; use std::{str::FromStr, sync::Arc}; use crate::{ @@ -75,11 +77,14 @@ impl Record { /// /// * Any error encountered during the creation. pub fn from_vcf_record( - record: &noodles_vcf::Record, + record: &noodles_vcf::variant::RecordBuf, cohort_name: &str, ) -> Result { - let chrom = record.chromosome().to_string(); - let start: usize = record.position().into(); + let chrom = record.reference_sequence_name().to_string(); + let start: usize = record + .variant_start() + .expect("Telomeric breakends not supported") + .get(); let stop = get_i32(record, "END").expect("no END?"); let inner_start = get_i32(record, "POSMAX").expect("no POSMAX?"); let outer_start = get_i32(record, "POSMIN").expect("no POSMIN?"); @@ -147,7 +152,7 @@ impl Record { /// Extract allele counts from VCF record. fn carrier_counts_by_sex_from_vcf_record( - record: &noodles_vcf::Record, + record: &noodles_vcf::variant::RecordBuf, population: Option, ) -> Result { let pop_prefix = population @@ -163,7 +168,7 @@ impl Record { /// Extract allele counts for a given population from VCF record. fn extract_carrier_counts( - record: &noodles_vcf::Record, + record: &noodles_vcf::variant::RecordBuf, prefix: &str, ) -> Result { let sc = get_f32(record, &format!("{}SC", prefix)).unwrap_or_default() as i32; @@ -217,12 +222,13 @@ pub fn import( }; tracing::info!("importing gnomAD-CNV v4 {} cohort", cohort_name); - let mut reader = noodles_vcf::reader::Builder::default().build_from_path(path_in_vcf)?; + let mut reader = noodles_vcf::io::reader::Builder::default().build_from_path(path_in_vcf)?; let header = reader.read_header()?; - for result in reader.records(&header) { + for result in reader.record_bufs(&header) { let vcf_record = result?; - let key = format!("{}", vcf_record.ids()).into_bytes(); + // TODO make sure this doesn't change anything + let key = vcf_record.ids().as_ref().iter().join(",").into_bytes(); // Build record for VCF record. let record = Record::from_vcf_record(&vcf_record, cohort_name) diff --git a/src/gnomad_sv/cli/import/gnomad_sv2.rs b/src/gnomad_sv/cli/import/gnomad_sv2.rs index b5aadfaa..da2947c0 100644 --- a/src/gnomad_sv/cli/import/gnomad_sv2.rs +++ b/src/gnomad_sv/cli/import/gnomad_sv2.rs @@ -5,6 +5,10 @@ use std::{str::FromStr, sync::Arc}; +use itertools::Itertools; +use noodles_vcf::variant::record::Ids; +use prost::Message; + use crate::{ common::noodles::{get_f32, get_i32, get_string}, pbs::gnomad::gnomad_sv2::{ @@ -13,8 +17,6 @@ use crate::{ }, }; -use prost::Message; - impl FromStr for Filter { type Err = anyhow::Error; @@ -125,12 +127,15 @@ impl Record { /// /// * Any error encountered during the creation. pub fn from_vcf_record( - record: &noodles_vcf::Record, + record: &noodles_vcf::variant::RecordBuf, cohort_name: &str, ) -> Result { - let chrom = record.chromosome().to_string(); - let pos: usize = record.position().into(); - let pos = pos as i32; + let chrom = record.reference_sequence_name().to_string(); + let pos: usize = record + .variant_start() + .expect("Telomeric breakends not supported") + .get(); + let pos = i32::try_from(pos)?; let end = get_i32(record, "END").ok(); let chrom2 = get_string(record, "CHROM2").ok(); let end2 = get_i32(record, "END2").ok(); @@ -140,25 +145,20 @@ impl Record { .next() .map(|s| s.to_string()) .ok_or_else(|| anyhow::anyhow!("no ID found in VCF record"))?; - let filters = record - .filters() - .map(|f| -> Result<_, anyhow::Error> { - use noodles_vcf::record::Filters::*; - Ok(match f { - Pass => vec![Filter::Pass as i32], - Fail(f) => { - let mut result = f - .iter() - .map(|s| s.parse::().map(|f| f as i32)) - .collect::, _>>() - .map_err(|e| anyhow::anyhow!("problem parsing FILTER: {}", e))?; - result.sort(); - result - } - }) - }) - .transpose()? - .unwrap_or_else(|| vec![Filter::Pass as i32]); + let filters = if record.filters().is_pass() { + vec![Filter::Pass as i32] + } else { + let mut result = record + .filters() + .as_ref() + .iter() + .map(|s| s.parse::().map(|f| f as i32)) + .collect::, _>>() + .map_err(|e| anyhow::anyhow!("problem parsing FILTER: {}", e))?; + result.sort(); + result + }; + let sv_type = get_string(record, "SVTYPE")? .parse::() .map(|x| x as i32)?; @@ -184,7 +184,7 @@ impl Record { /// Extract allele counts from VCF record. fn allele_counts_from_vcf_record( - record: &noodles_vcf::Record, + record: &noodles_vcf::variant::RecordBuf, cohort_name: &str, ) -> Result { let cohort = if cohort_name == "all" { @@ -224,7 +224,7 @@ impl Record { /// Extract poulation allele counts. fn extract_population_allele_counts( - record: &noodles_vcf::Record, + record: &noodles_vcf::variant::RecordBuf, population: Population, ) -> Result { let pop_str = population.to_string(); @@ -246,7 +246,7 @@ impl Record { /// Extract allele counts for a given population from VCF record. fn extract_allele_counts( - record: &noodles_vcf::Record, + record: &noodles_vcf::variant::RecordBuf, prefix: &str, population: &str, ) -> Result { @@ -327,12 +327,12 @@ pub fn import( }; tracing::info!("importing gnomAD-SV v2 {} cohort", cohort_name); - let mut reader = noodles_vcf::reader::Builder::default().build_from_path(path_in_vcf)?; + let mut reader = noodles_vcf::io::reader::Builder::default().build_from_path(path_in_vcf)?; let header = reader.read_header()?; - for result in reader.records(&header) { + for result in reader.record_bufs(&header) { let vcf_record = result?; - let key = format!("{}", vcf_record.ids()).into_bytes(); + let key = vcf_record.ids().as_ref().iter().join(",").into_bytes(); // Build record for VCF record. let record = Record::from_vcf_record(&vcf_record, cohort_name) diff --git a/src/gnomad_sv/cli/import/gnomad_sv4.rs b/src/gnomad_sv/cli/import/gnomad_sv4.rs index 87f00222..d2785fe3 100644 --- a/src/gnomad_sv/cli/import/gnomad_sv4.rs +++ b/src/gnomad_sv/cli/import/gnomad_sv4.rs @@ -5,6 +5,11 @@ use std::{str::FromStr, sync::Arc}; +use indicatif::ParallelProgressIterator as _; +use noodles_vcf::variant::record::Ids; +use prost::Message as _; +use rayon::iter::{IntoParallelRefIterator as _, ParallelIterator as _}; + use crate::{ common::{ self, @@ -17,10 +22,6 @@ use crate::{ }, }; -use indicatif::ParallelProgressIterator as _; -use prost::Message as _; -use rayon::iter::{IntoParallelRefIterator as _, ParallelIterator as _}; - impl FromStr for Filter { type Err = anyhow::Error; @@ -120,10 +121,15 @@ impl Record { /// # Errors /// /// * Any error encountered during the creation. - pub fn from_vcf_record(record: &noodles_vcf::Record) -> Result { - let chrom = record.chromosome().to_string(); - let pos: usize = record.position().into(); - let pos = pos as i32; + pub fn from_vcf_record( + record: &noodles_vcf::variant::RecordBuf, + ) -> Result { + let chrom = record.reference_sequence_name().to_string(); + let pos: usize = record + .variant_start() + .expect("Telomeric breakends not supported") + .get(); + let pos = i32::try_from(pos)?; let end = get_i32(record, "END").ok(); let chrom2 = get_string(record, "CHROM2").ok(); let end2 = get_i32(record, "END2").ok(); @@ -133,25 +139,20 @@ impl Record { .next() .map(|s| s.to_string()) .ok_or_else(|| anyhow::anyhow!("no ID found in VCF record"))?; - let filters = record - .filters() - .map(|f| -> Result<_, anyhow::Error> { - use noodles_vcf::record::Filters::*; - Ok(match f { - Pass => vec![Filter::Pass as i32], - Fail(f) => { - let mut result = f - .iter() - .map(|s| s.parse::().map(|f| f as i32)) - .collect::, _>>() - .map_err(|e| anyhow::anyhow!("problem parsing FILTER: {}", e))?; - result.sort(); - result - } - }) - }) - .transpose()? - .unwrap_or_else(|| vec![Filter::Pass as i32]); + let filters = if record.filters().is_pass() { + vec![Filter::Pass as i32] + } else { + let mut result = record + .filters() + .as_ref() + .iter() + .map(|s| s.parse::().map(|f| f as i32)) + .collect::, _>>() + .map_err(|e| anyhow::anyhow!("problem parsing FILTER: {}", e))?; + result.sort(); + result + }; + let sv_type = get_string(record, "SVTYPE")? .parse::() .map(|x| x as i32)?; @@ -180,7 +181,7 @@ impl Record { /// Extract allele counts from VCF record. fn allele_counts_from_vcf_record( - record: &noodles_vcf::Record, + record: &noodles_vcf::variant::RecordBuf, cohort_name: &str, ) -> Result { let cohort = if cohort_name == "all" { @@ -225,7 +226,7 @@ impl Record { /// Extract poulation allele counts. fn extract_population_allele_counts( - record: &noodles_vcf::Record, + record: &noodles_vcf::variant::RecordBuf, population: Population, ) -> Result { let pop_str = population.to_string(); @@ -247,7 +248,7 @@ impl Record { /// Extract allele counts for a given population from VCF record. fn extract_allele_counts( - record: &noodles_vcf::Record, + record: &noodles_vcf::variant::RecordBuf, prefix: &str, population: &str, ) -> Result { @@ -317,13 +318,15 @@ fn import_file( cf_data_name: &str, path_in_vcf: &str, ) -> Result<(), anyhow::Error> { - let mut reader = noodles_vcf::reader::Builder::default().build_from_path(path_in_vcf)?; + let mut reader = noodles_vcf::io::reader::Builder::default().build_from_path(path_in_vcf)?; let header = reader.read_header()?; let cf_data = db.cf_handle(cf_data_name).unwrap(); - for result in reader.records(&header) { + for result in reader.record_bufs(&header) { let vcf_record = result?; - let key = format!("{}", vcf_record.ids()).into_bytes(); + // TODO check if this key is the same as before + use itertools::Itertools; + let key = vcf_record.ids().as_ref().iter().join(",").into_bytes(); // Build record for VCF record. let record = Record::from_vcf_record(&vcf_record) diff --git a/src/helixmtdb/cli/import.rs b/src/helixmtdb/cli/import.rs index 5e7189dc..d51164d7 100644 --- a/src/helixmtdb/cli/import.rs +++ b/src/helixmtdb/cli/import.rs @@ -5,6 +5,8 @@ use std::sync::Arc; use clap::Parser; use indicatif::ParallelProgressIterator; use noodles_csi::BinningIndex as _; +use noodles_vcf::variant::record::AlternateBases; +use noodles_vcf::variant::RecordBuf; use prost::Message; use rayon::prelude::{IntoParallelRefIterator, ParallelIterator}; @@ -98,7 +100,7 @@ fn process_window( ) -> Result<(), anyhow::Error> { let cf_helix = db.cf_handle(&args.cf_name).unwrap(); let mut reader = - noodles_vcf::indexed_reader::Builder::default().build_from_path(&args.path_in_vcf)?; + noodles_vcf::io::indexed_reader::Builder::default().build_from_path(&args.path_in_vcf)?; let header = reader.read_header()?; let raw_region = format!("{}:{}-{}", chrom, begin + 1, end); @@ -125,6 +127,7 @@ fn process_window( if let Some(query) = query { for result in query { let vcf_record = result?; + let vcf_record = RecordBuf::try_from_variant_record(&header, &vcf_record)?; // Process each alternate allele into one record. for allele_no in 0..vcf_record.alternate_bases().len() { diff --git a/src/helixmtdb/pbs.rs b/src/helixmtdb/pbs.rs index 67184487..a1819bac 100644 --- a/src/helixmtdb/pbs.rs +++ b/src/helixmtdb/pbs.rs @@ -1,60 +1,59 @@ //! Data structures for (de-)serialization as generated by `prost-build`. -use std::str::FromStr; +use noodles_vcf::variant::record::AlternateBases; pub use crate::pbs::helixmtdb::Record; -use noodles_vcf::record::info::field; +use noodles_vcf::variant::record_buf::info::field; impl Record { /// Creates a new `Record` from a VCF record and allele number. pub fn from_vcf_allele( - record: &noodles_vcf::record::Record, + record: &noodles_vcf::variant::RecordBuf, allele_no: usize, ) -> Result { - let chrom = record.chromosome().to_string(); - let pos: usize = record.position().into(); - let pos = pos as i32; + let chrom = record.reference_sequence_name().to_string(); + let pos: usize = record + .variant_start() + .expect("Telomeric breakends not supported") + .get(); + let pos = i32::try_from(pos)?; let ref_allele = record.reference_bases().to_string(); let alt_allele = record .alternate_bases() - .get(allele_no) - .ok_or_else(|| anyhow::anyhow!("no such allele: {}", allele_no))? + .iter() + .nth(allele_no) + .ok_or_else(|| anyhow::anyhow!("no such allele: {}", allele_no))?? .to_string(); - let num_total = if let Some(Some(field::Value::Integer(num_total))) = - record.info().get(&field::Key::from_str("AN")?) - { - *num_total - } else { - anyhow::bail!("missing INFO/AN in HelixMtDb record") - }; - let num_het = if let Some(Some(field::Value::Integer(num_het))) = - record.info().get(&field::Key::from_str("AC_het")?) - { - *num_het - } else { - anyhow::bail!("missing INFO/AC in HelixMtDb record") - }; - let num_hom = if let Some(Some(field::Value::Integer(num_hom))) = - record.info().get(&field::Key::from_str("AC_hom")?) - { - *num_hom - } else { - anyhow::bail!("missing INFO/AC_hom in HelixMtDb record") - }; - let feature_type = if let Some(Some(field::Value::String(feature))) = - record.info().get(&field::Key::from_str("FEATURE")?) - { - feature.to_string() - } else { - anyhow::bail!("missing INFO/FEATURE in HelixMtDb record") - }; - let gene_name = if let Some(Some(field::Value::String(gene_name))) = - record.info().get(&field::Key::from_str("GENE")?) - { - gene_name.to_string() - } else { - anyhow::bail!("missing INFO/GENE in HelixMtDb record") - }; + let num_total = + if let Some(Some(field::Value::Integer(num_total))) = record.info().get("AN") { + *num_total + } else { + anyhow::bail!("missing INFO/AN in HelixMtDb record") + }; + let num_het = + if let Some(Some(field::Value::Integer(num_het))) = record.info().get("AC_het") { + *num_het + } else { + anyhow::bail!("missing INFO/AC in HelixMtDb record") + }; + let num_hom = + if let Some(Some(field::Value::Integer(num_hom))) = record.info().get("AC_hom") { + *num_hom + } else { + anyhow::bail!("missing INFO/AC_hom in HelixMtDb record") + }; + let feature_type = + if let Some(Some(field::Value::String(feature))) = record.info().get("FEATURE") { + feature.to_string() + } else { + anyhow::bail!("missing INFO/FEATURE in HelixMtDb record") + }; + let gene_name = + if let Some(Some(field::Value::String(gene_name))) = record.info().get("GENE") { + gene_name.to_string() + } else { + anyhow::bail!("missing INFO/GENE in HelixMtDb record") + }; Ok(Record { chrom, diff --git a/src/pbs/gnomad/gnomad2.rs b/src/pbs/gnomad/gnomad2.rs index c0450a60..c9e4d2ce 100644 --- a/src/pbs/gnomad/gnomad2.rs +++ b/src/pbs/gnomad/gnomad2.rs @@ -1,8 +1,9 @@ //! Code generate for protobufs by `prost-build`. +use noodles_vcf::variant::record::AlternateBases; use std::str::FromStr; -use noodles_vcf::record::info::field; +use noodles_vcf::variant::record_buf::info::field; use crate::common; @@ -85,7 +86,7 @@ impl DetailsOptions { impl Record { /// Creates a new `Record` from a VCF record and allele number. pub fn from_vcf_allele( - record: &noodles_vcf::record::Record, + record: &noodles_vcf::variant::RecordBuf, allele_no: usize, options: &DetailsOptions, ) -> Result { @@ -94,14 +95,18 @@ impl Record { assert!(allele_no == 0, "only allele 0 is supported"); // Extract mandatory fields. - let chrom = record.chromosome().to_string(); - let pos: usize = record.position().into(); - let pos = pos as i32; + let chrom = record.reference_sequence_name().to_string(); + let pos: usize = record + .variant_start() + .expect("Telomeric breakends not supported") + .get(); + let pos = i32::try_from(pos).unwrap(); let ref_allele = record.reference_bases().to_string(); let alt_allele = record .alternate_bases() - .get(allele_no) - .ok_or_else(|| anyhow::anyhow!("no such allele: {}", allele_no))? + .iter() + .nth(allele_no) + .ok_or_else(|| anyhow::anyhow!("no such allele: {}", allele_no))?? .to_string(); let filters = Self::extract_filters(record)?; let allele_counts = Self::extract_cohorts_allele_counts(record, options)?; @@ -159,10 +164,10 @@ impl Record { /// Extract the "vep" field into gnomAD v2 `Vep` records. fn extract_vep( - record: &noodles_vcf::Record, + record: &noodles_vcf::variant::RecordBuf, ) -> Result, anyhow::Error> { if let Some(Some(field::Value::Array(field::value::Array::String(v)))) = - record.info().get(&field::Key::from_str("vep")?) + record.info().get("vep") { v.iter() .flat_map(|v| { @@ -184,7 +189,7 @@ impl Record { /// Extract the liftover related fields into gnomAD v2 `Vep` records. fn extract_liftover( - record: &noodles_vcf::Record, + record: &noodles_vcf::variant::RecordBuf, ) -> Result, anyhow::Error> { let tmp = LiftoverInfo { reverse_complemented_alleles: common::noodles::get_flag( @@ -210,7 +215,9 @@ impl Record { } /// Extract the details on the random forest. - fn extract_rf_info(record: &noodles_vcf::Record) -> Result { + fn extract_rf_info( + record: &noodles_vcf::variant::RecordBuf, + ) -> Result { Ok(RandomForestInfo { rf_tp_probability: common::noodles::get_f32(record, "rf_tp_probability")?, rf_positive_label: common::noodles::get_flag(record, "rf_positive_label")?, @@ -221,7 +228,9 @@ impl Record { } /// Extract the details on the variant. - fn extract_variant_info(record: &noodles_vcf::Record) -> Result { + fn extract_variant_info( + record: &noodles_vcf::variant::RecordBuf, + ) -> Result { Ok(VariantInfo { variant_type: common::noodles::get_string(record, "variant_type")?, allele_type: common::noodles::get_string(record, "allele_type")?, @@ -232,10 +241,12 @@ impl Record { } /// Extract the filters fields. - fn extract_filters(record: &noodles_vcf::Record) -> Result, anyhow::Error> { + fn extract_filters( + record: &noodles_vcf::variant::RecordBuf, + ) -> Result, anyhow::Error> { Ok( if let Some(Some(field::Value::Array(field::value::Array::String(value)))) = - record.info().get(&field::Key::from_str("filters")?) + record.info().get("filters") { value .iter() @@ -255,7 +266,7 @@ impl Record { } /// Extract the age related fields from the VCF record. - fn extract_age(record: &noodles_vcf::record::Record) -> Result { + fn extract_age(record: &noodles_vcf::variant::RecordBuf) -> Result { Ok(AgeInfo { age_hist_hom_bin_freq: common::noodles::get_vec::(record, "age_hist_hom_bin_freq") .unwrap_or_default(), @@ -269,7 +280,7 @@ impl Record { } /// Extract the depth related fields from the VCF record. - fn extract_depth(record: &noodles_vcf::record::Record) -> Result { + fn extract_depth(record: &noodles_vcf::variant::RecordBuf) -> Result { Ok(DepthInfo { dp_hist_all_n_larger: common::noodles::get_i32(record, "dp_hist_all_n_larger").ok(), dp_hist_alt_n_larger: common::noodles::get_i32(record, "dp_hist_alt_n_larger").ok(), @@ -281,7 +292,9 @@ impl Record { } /// Extract the quality-related fields from the VCF record. - fn extract_quality(record: &noodles_vcf::record::Record) -> Result { + fn extract_quality( + record: &noodles_vcf::variant::RecordBuf, + ) -> Result { Ok(QualityInfo { fs: common::noodles::get_f32(record, "FS").ok(), inbreeding_coeff: common::noodles::get_f32(record, "InbreedingCoeff").ok(), @@ -313,7 +326,7 @@ impl Record { /// Extract the allele counts from the `record` as configured in `options`. fn extract_cohorts_allele_counts( - record: &noodles_vcf::Record, + record: &noodles_vcf::variant::RecordBuf, options: &DetailsOptions, ) -> Result, anyhow::Error> { // Initialize global cohort. We will always extract the non-population specific @@ -388,7 +401,7 @@ impl Record { /// Extrac the population allele counts from the `record`. fn extract_population_allele_counts( - record: &noodles_vcf::Record, + record: &noodles_vcf::variant::RecordBuf, prefix: &str, pop: &str, ) -> Result { @@ -408,7 +421,7 @@ impl Record { /// Extract the allele counts from the `record` with the given prefix and suffix. fn extract_allele_counts( - record: &noodles_vcf::Record, + record: &noodles_vcf::variant::RecordBuf, prefix: &str, suffix: &str, ) -> Result, anyhow::Error> { @@ -438,11 +451,12 @@ mod test { #[test] fn test_record_from_vcf_allele_gnomad_exomes_grch37() -> Result<(), anyhow::Error> { let path_vcf = "tests/gnomad-nuclear/example-exomes-grch37/v2.1/gnomad-exomes.vcf"; - let mut reader_vcf = noodles_vcf::reader::Builder::default().build_from_path(path_vcf)?; + let mut reader_vcf = + noodles_vcf::io::reader::Builder::default().build_from_path(path_vcf)?; let header = reader_vcf.read_header()?; let mut records = Vec::new(); - for row in reader_vcf.records(&header) { + for row in reader_vcf.record_bufs(&header) { let vcf_record = row?; let record = Record::from_vcf_allele(&vcf_record, 0, &DetailsOptions::with_all_enabled())?; @@ -457,11 +471,12 @@ mod test { #[test] fn test_record_from_vcf_allele_gnomad_genomes_grch37() -> Result<(), anyhow::Error> { let path_vcf = "tests/gnomad-nuclear/example-genomes-grch37/v2.1/gnomad-genomes.vcf"; - let mut reader_vcf = noodles_vcf::reader::Builder::default().build_from_path(path_vcf)?; + let mut reader_vcf = + noodles_vcf::io::reader::Builder::default().build_from_path(path_vcf)?; let header = reader_vcf.read_header()?; let mut records = Vec::new(); - for row in reader_vcf.records(&header) { + for row in reader_vcf.record_bufs(&header) { let vcf_record = row?; let record = Record::from_vcf_allele(&vcf_record, 0, &DetailsOptions::with_all_enabled())?; @@ -476,11 +491,12 @@ mod test { #[test] fn test_record_from_vcf_allele_gnomad_exomes_grch38() -> Result<(), anyhow::Error> { let path_vcf = "tests/gnomad-nuclear/example-exomes-grch38/v2.1/gnomad-exomes.vcf"; - let mut reader_vcf = noodles_vcf::reader::Builder::default().build_from_path(path_vcf)?; + let mut reader_vcf = + noodles_vcf::io::reader::Builder::default().build_from_path(path_vcf)?; let header = reader_vcf.read_header()?; let mut records = Vec::new(); - for row in reader_vcf.records(&header) { + for row in reader_vcf.record_bufs(&header) { let vcf_record = row?; let record = Record::from_vcf_allele(&vcf_record, 0, &DetailsOptions::with_all_enabled())?; diff --git a/src/pbs/gnomad/gnomad3.rs b/src/pbs/gnomad/gnomad3.rs index 78039226..6e4715d5 100644 --- a/src/pbs/gnomad/gnomad3.rs +++ b/src/pbs/gnomad/gnomad3.rs @@ -1,8 +1,9 @@ //! Code generate for protobufs by `prost-build`. +use noodles_vcf::variant::record::AlternateBases; use std::str::FromStr; -use noodles_vcf::record::info::field; +use noodles_vcf::variant::record_buf::info::field; use crate::common; @@ -86,23 +87,25 @@ impl DetailsOptions { impl Record { /// Creates a new `Record` from a VCF record and allele number. pub fn from_vcf_allele( - record: &noodles_vcf::record::Record, + record: &noodles_vcf::variant::RecordBuf, allele_no: usize, options: &DetailsOptions, ) -> Result { assert!(allele_no == 0, "only allele 0 is supported"); - assert!(allele_no == 0, "only allele 0 is supported"); - // Extract mandatory fields. - let chrom = record.chromosome().to_string(); - let pos: usize = record.position().into(); + let chrom = record.reference_sequence_name().to_string(); + let pos: usize = record + .variant_start() + .expect("Telomeric breakends not supported") + .get(); let pos = pos as i32; let ref_allele = record.reference_bases().to_string(); let alt_allele = record .alternate_bases() - .get(allele_no) - .ok_or_else(|| anyhow::anyhow!("no such allele: {}", allele_no))? + .iter() + .nth(allele_no) + .ok_or_else(|| anyhow::anyhow!("no such allele: {}", allele_no))?? .to_string(); let filters = Self::extract_filters(record)?; let allele_counts = Self::extract_cohorts_allele_counts(record, options)?; @@ -154,10 +157,10 @@ impl Record { /// Extract the "vep" field into gnomAD v3 `Vep` records. pub(crate) fn extract_vep( - record: &noodles_vcf::Record, + record: &noodles_vcf::variant::RecordBuf, ) -> Result, anyhow::Error> { if let Some(Some(field::Value::Array(field::value::Array::String(v)))) = - record.info().get(&field::Key::from_str("vep")?) + record.info().get("vep") { v.iter() .flat_map(|v| { @@ -179,7 +182,7 @@ impl Record { /// Extract the details on the variant. pub(crate) fn extract_variant_info( - record: &noodles_vcf::Record, + record: &noodles_vcf::variant::RecordBuf, ) -> Result { Ok(VariantInfo { variant_type: common::noodles::get_string(record, "variant_type")?, @@ -194,7 +197,7 @@ impl Record { /// Extract details on the variant effects. pub(crate) fn extract_effect_info( - record: &noodles_vcf::Record, + record: &noodles_vcf::variant::RecordBuf, ) -> Result { Ok(EffectInfo { primate_ai_score: common::noodles::get_f32(record, "primate_ai_score").ok(), @@ -208,10 +211,12 @@ impl Record { } /// Extract the filters fields. - pub(crate) fn extract_filters(record: &noodles_vcf::Record) -> Result, anyhow::Error> { + pub(crate) fn extract_filters( + record: &noodles_vcf::variant::RecordBuf, + ) -> Result, anyhow::Error> { Ok( if let Some(Some(field::Value::Array(field::value::Array::String(value)))) = - record.info().get(&field::Key::from_str("filters")?) + record.info().get("filters") { value .iter() @@ -234,7 +239,7 @@ impl Record { /// Extract the age related fields from the VCF record. pub(crate) fn extract_age( - record: &noodles_vcf::record::Record, + record: &noodles_vcf::variant::RecordBuf, ) -> Result { Ok(AgeInfo { age_hist_hom_bin_freq: common::noodles::get_vec::(record, "age_hist_hom_bin_freq") @@ -250,7 +255,7 @@ impl Record { /// Extract the depth related fields from the VCF record. pub(crate) fn extract_depth( - record: &noodles_vcf::record::Record, + record: &noodles_vcf::variant::RecordBuf, ) -> Result { Ok(DepthInfo { dp_hist_all_n_larger: common::noodles::get_i32(record, "dp_hist_all_n_larger").ok(), @@ -264,7 +269,7 @@ impl Record { /// Extract the quality-related fields from the VCF record. pub(crate) fn extract_quality( - record: &noodles_vcf::record::Record, + record: &noodles_vcf::variant::RecordBuf, ) -> Result { Ok(QualityInfo { as_fs: common::noodles::get_f32(record, "AS_FS").ok(), @@ -294,7 +299,7 @@ impl Record { /// Extract the allele counts from the `record` as configured in `options`. pub(crate) fn extract_cohorts_allele_counts( - record: &noodles_vcf::Record, + record: &noodles_vcf::variant::RecordBuf, options: &DetailsOptions, ) -> Result, anyhow::Error> { // Initialize global cohort. We will always extract the non-population specific @@ -367,7 +372,7 @@ impl Record { /// Extrac the population allele counts from the `record`. pub(crate) fn extract_population_allele_counts( - record: &noodles_vcf::Record, + record: &noodles_vcf::variant::RecordBuf, infix: &str, pop: &str, ) -> Result { @@ -403,7 +408,7 @@ impl Record { /// Extract the allele counts from the `record` with the given infix and suffix. pub(crate) fn extract_allele_counts( - record: &noodles_vcf::Record, + record: &noodles_vcf::variant::RecordBuf, infix: &str, suffix: &str, ) -> Result { @@ -427,11 +432,12 @@ mod test { #[test] fn test_record_from_vcf_allele_gnomad_genomes_grch38() -> Result<(), anyhow::Error> { let path_vcf = "tests/gnomad-nuclear/example-genomes-grch38/v3.1/gnomad-genomes.vcf"; - let mut reader_vcf = noodles_vcf::reader::Builder::default().build_from_path(path_vcf)?; + let mut reader_vcf = + noodles_vcf::io::reader::Builder::default().build_from_path(path_vcf)?; let header = reader_vcf.read_header()?; let mut records = Vec::new(); - for row in reader_vcf.records(&header) { + for row in reader_vcf.record_bufs(&header) { let vcf_record = row?; let record = Record::from_vcf_allele(&vcf_record, 0, &DetailsOptions::with_all_enabled())?; diff --git a/src/pbs/gnomad/gnomad4.rs b/src/pbs/gnomad/gnomad4.rs index 7f893af0..8e633a73 100644 --- a/src/pbs/gnomad/gnomad4.rs +++ b/src/pbs/gnomad/gnomad4.rs @@ -1,8 +1,9 @@ //! Code generate for protobufs by `prost-build`. +use noodles_vcf::variant::record::AlternateBases; use std::str::FromStr; -use noodles_vcf::record::info::field; +use noodles_vcf::variant::record_buf::info::field; use super::gnomad3; use crate::common; @@ -58,7 +59,7 @@ impl Record { /// /// The `Record` or an error if the record could not be extracted. pub fn from_vcf_allele( - record: &noodles_vcf::record::Record, + record: &noodles_vcf::variant::RecordBuf, allele_no: usize, options: &gnomad3::DetailsOptions, record_type: RecordType, @@ -68,14 +69,18 @@ impl Record { assert!(allele_no == 0, "only allele 0 is supported"); // Extract mandatory fields. - let chrom = record.chromosome().to_string(); - let pos: usize = record.position().into(); + let chrom = record.reference_sequence_name().to_string(); + let pos: usize = record + .variant_start() + .expect("Telomeric breakends not supported") + .get(); let pos = pos as i32; let ref_allele = record.reference_bases().to_string(); let alt_allele = record .alternate_bases() - .get(allele_no) - .ok_or_else(|| anyhow::anyhow!("no such allele: {}", allele_no))? + .iter() + .nth(allele_no) + .ok_or_else(|| anyhow::anyhow!("no such allele: {}", allele_no))?? .to_string(); let filters = gnomad3::Record::extract_filters(record)?; let allele_counts = Self::extract_cohorts_allele_counts(record, record_type)?; @@ -143,10 +148,10 @@ impl Record { /// Extract the "vep" field into gnomAD v3 `Vep` records. fn extract_vep( - record: &noodles_vcf::Record, + record: &noodles_vcf::variant::RecordBuf, ) -> Result, anyhow::Error> { if let Some(Some(field::Value::Array(field::value::Array::String(v)))) = - record.info().get(&field::Key::from_str("vep")?) + record.info().get("vep") { v.iter() .flat_map(|v| { @@ -167,7 +172,9 @@ impl Record { } /// Extract the VRS infos. - fn extract_vrs_info(record: &noodles_vcf::Record) -> Result { + fn extract_vrs_info( + record: &noodles_vcf::variant::RecordBuf, + ) -> Result { Ok(VrsInfo { allele_ids: common::noodles::get_vec_str(record, "VRS_Allele_IDs").unwrap_or_default(), ends: common::noodles::get_vec_i32(record, "VRS_Ends").unwrap_or_default(), @@ -177,7 +184,9 @@ impl Record { } /// Extract details on the variant effects. - fn extract_effect_info(record: &noodles_vcf::Record) -> Result { + fn extract_effect_info( + record: &noodles_vcf::variant::RecordBuf, + ) -> Result { Ok(EffectInfo { pangolin_largest_ds: common::noodles::get_f32(record, "pangolin_largest_ds").ok(), phylop: common::noodles::get_f32(record, "phylop").ok(), @@ -192,7 +201,7 @@ impl Record { /// Extract the allele counts from the `record` as configured in `options`. fn extract_cohorts_allele_counts( - record: &noodles_vcf::Record, + record: &noodles_vcf::variant::RecordBuf, record_type: RecordType, ) -> Result, anyhow::Error> { // Initialize global cohort. @@ -270,7 +279,7 @@ impl Record { /// Extrac the ancestry group allele counts from the `record`. fn extract_ancestry_group_allele_counts( - record: &noodles_vcf::Record, + record: &noodles_vcf::variant::RecordBuf, infix: &str, grp: &str, ) -> Result { @@ -306,7 +315,7 @@ impl Record { /// Extract the allele counts from the `record` with the given infix and suffix. fn extract_allele_counts( - record: &noodles_vcf::Record, + record: &noodles_vcf::variant::RecordBuf, infix: &str, suffix: &str, ) -> Result { @@ -330,11 +339,12 @@ mod test { #[test] fn test_record_from_vcf_allele_gnomad_genomes_grch38() -> Result<(), anyhow::Error> { let path_vcf = "tests/gnomad-nuclear/example-genomes-grch38/v4.0/gnomad-genomes.vcf"; - let mut reader_vcf = noodles_vcf::reader::Builder::default().build_from_path(path_vcf)?; + let mut reader_vcf = + noodles_vcf::io::reader::Builder::default().build_from_path(path_vcf)?; let header = reader_vcf.read_header()?; let mut records = Vec::new(); - for row in reader_vcf.records(&header) { + for row in reader_vcf.record_bufs(&header) { let vcf_record = row?; let record = Record::from_vcf_allele( &vcf_record, @@ -353,11 +363,12 @@ mod test { #[test] fn test_record_from_vcf_allele_gnomad_exomess_grch38() -> Result<(), anyhow::Error> { let path_vcf = "tests/gnomad-nuclear/example-exomes-grch38/v4.0/gnomad-exomes.vcf"; - let mut reader_vcf = noodles_vcf::reader::Builder::default().build_from_path(path_vcf)?; + let mut reader_vcf = + noodles_vcf::io::reader::Builder::default().build_from_path(path_vcf)?; let header = reader_vcf.read_header()?; let mut records = Vec::new(); - for row in reader_vcf.records(&header) { + for row in reader_vcf.record_bufs(&header) { let vcf_record = row?; let record = Record::from_vcf_allele( &vcf_record, diff --git a/src/pbs/gnomad/mtdna.rs b/src/pbs/gnomad/mtdna.rs index 7c2a4e08..acc1d856 100644 --- a/src/pbs/gnomad/mtdna.rs +++ b/src/pbs/gnomad/mtdna.rs @@ -1,8 +1,9 @@ //! Code generate for protobufs by `prost-build`. +use noodles_vcf::variant::record::AlternateBases; use std::str::FromStr; -use noodles_vcf::record::info::field; +use noodles_vcf::variant::record_buf::info::field; use super::vep_gnomad3::Vep; use crate::common; @@ -68,21 +69,25 @@ impl DetailsOptions { impl Record { /// Creates a new `Record` from a VCF record and allele number. pub fn from_vcf_allele( - record: &noodles_vcf::record::Record, + record: &noodles_vcf::variant::RecordBuf, allele_no: usize, options: &DetailsOptions, ) -> Result { assert!(allele_no == 0, "only allele 0 is supported"); // Extract mandatory fields. - let chrom = record.chromosome().to_string(); - let pos: usize = record.position().into(); + let chrom = record.reference_sequence_name().to_string(); + let pos: usize = record + .variant_start() + .expect("Telomeric breakends not supported") + .get(); let pos = pos as i32; let ref_allele = record.reference_bases().to_string(); let alt_allele = record .alternate_bases() - .get(allele_no) - .ok_or_else(|| anyhow::anyhow!("no such allele: {}", allele_no))? + .iter() + .nth(allele_no) + .ok_or_else(|| anyhow::anyhow!("no such allele: {}", allele_no))?? .to_string(); let variant_collapsed = common::noodles::get_string(record, "variant_collapsed")?; let excluded_ac = common::noodles::get_i32(record, "excluded_AC")?; @@ -164,9 +169,9 @@ impl Record { } /// Extract the "vep" field. - fn extract_vep(record: &noodles_vcf::Record) -> Result, anyhow::Error> { + fn extract_vep(record: &noodles_vcf::variant::RecordBuf) -> Result, anyhow::Error> { if let Some(Some(field::Value::Array(field::value::Array::String(v)))) = - record.info().get(&field::Key::from_str("vep")?) + record.info().get("vep") { v.iter() .flat_map(|v| v.as_ref().map(|s| Vep::from_str(s))) @@ -178,7 +183,7 @@ impl Record { /// Extract the heteroplasmy-related fields from the VCF record. fn extract_heteroplasmy( - record: &noodles_vcf::record::Record, + record: &noodles_vcf::variant::RecordBuf, ) -> Result { Ok(HeteroplasmyInfo { heteroplasmy_below_min_het_threshold_hist: common::noodles::get_vec::( @@ -193,7 +198,7 @@ impl Record { /// Extract the filter histogram related fields form the VCF record. fn extract_filter_histograms( - record: &noodles_vcf::record::Record, + record: &noodles_vcf::variant::RecordBuf, ) -> Result { Ok(FilterHistograms { base_qual_hist: common::noodles::get_vec::(record, "base_qual_hist") @@ -211,7 +216,7 @@ impl Record { /// Extract the population related fields from the VCF record. fn extract_population( - record: &noodles_vcf::record::Record, + record: &noodles_vcf::variant::RecordBuf, ) -> Result { Ok(PopulationInfo { pop_an: common::noodles::get_vec::(record, "pop_AN")?, @@ -226,7 +231,7 @@ impl Record { /// Extract the haplogroup related fields from the VCF record. fn extract_haplogroup( - record: &noodles_vcf::record::Record, + record: &noodles_vcf::variant::RecordBuf, ) -> Result { Ok(HaplogroupInfo { hap_defining_variant: common::noodles::get_flag(record, "hap_defining_variant")?, @@ -245,7 +250,7 @@ impl Record { } /// Extract the age related fields from the VCF record. - fn extract_age(record: &noodles_vcf::record::Record) -> Result { + fn extract_age(record: &noodles_vcf::variant::RecordBuf) -> Result { Ok(AgeInfo { age_hist_hom_bin_freq: common::noodles::get_vec::(record, "age_hist_hom_bin_freq") .unwrap_or_default(), @@ -259,7 +264,7 @@ impl Record { } /// Extract the depth related fields from the VCF record. - fn extract_depth(record: &noodles_vcf::record::Record) -> Result { + fn extract_depth(record: &noodles_vcf::variant::RecordBuf) -> Result { Ok(DepthInfo { dp_hist_all_n_larger: common::noodles::get_i32(record, "dp_hist_all_n_larger").ok(), dp_hist_alt_n_larger: common::noodles::get_i32(record, "dp_hist_alt_n_larger").ok(), @@ -271,7 +276,9 @@ impl Record { } /// Extract the quality-related fields from the VCF record. - fn extract_quality(record: &noodles_vcf::record::Record) -> Result { + fn extract_quality( + record: &noodles_vcf::variant::RecordBuf, + ) -> Result { Ok(QualityInfo { dp_mean: common::noodles::get_f32(record, "dp_mean").ok(), mq_mean: common::noodles::get_f32(record, "mq_mean").ok(), @@ -280,10 +287,12 @@ impl Record { } /// Extract the filters fields. - fn extract_filters(record: &noodles_vcf::Record) -> Result, anyhow::Error> { + fn extract_filters( + record: &noodles_vcf::variant::RecordBuf, + ) -> Result, anyhow::Error> { Ok( if let Some(Some(field::Value::Array(field::value::Array::String(value)))) = - record.info().get(&field::Key::from_str("filters")?) + record.info().get("filters") { value .iter() @@ -309,11 +318,12 @@ mod test { #[test] fn test_record_from_vcf_allele() -> Result<(), anyhow::Error> { let path_vcf = "tests/gnomad-mtdna/example/gnomad-mtdna.vcf"; - let mut reader_vcf = noodles_vcf::reader::Builder::default().build_from_path(path_vcf)?; + let mut reader_vcf = + noodles_vcf::io::reader::Builder::default().build_from_path(path_vcf)?; let header = reader_vcf.read_header()?; let mut records = Vec::new(); - for row in reader_vcf.records(&header) { + for row in reader_vcf.record_bufs(&header) { let vcf_record = row?; let record = Record::from_vcf_allele(&vcf_record, 0, &DetailsOptions::with_all_enabled())?;