From 9fba435691be6498697aed7eae6da310f0761771 Mon Sep 17 00:00:00 2001 From: Manuel Holtgrewe Date: Thu, 20 Apr 2023 16:55:08 +0200 Subject: [PATCH] feat: allow annotation from maelstorm cov/mq BED files (#51) --- src/annotate/strucvars/maelstrom.rs | 209 ++++++++++++++++++ src/annotate/strucvars/mod.rs | 12 +- .../maelstrom/example.SAMPLE.cov.vcf.gz | 3 + .../maelstrom/example.SAMPLE.cov.vcf.gz.tbi | 3 + 4 files changed, 225 insertions(+), 2 deletions(-) create mode 100644 src/annotate/strucvars/maelstrom.rs create mode 100644 tests/data/annotate/strucvars/maelstrom/example.SAMPLE.cov.vcf.gz create mode 100644 tests/data/annotate/strucvars/maelstrom/example.SAMPLE.cov.vcf.gz.tbi diff --git a/src/annotate/strucvars/maelstrom.rs b/src/annotate/strucvars/maelstrom.rs new file mode 100644 index 00000000..610d1e55 --- /dev/null +++ b/src/annotate/strucvars/maelstrom.rs @@ -0,0 +1,209 @@ +//! Code for reading maelstrom coverage and mapping quality VCF files. + +use noodles::{ + core::{Position, Region}, + vcf, +}; +use std::ops::Deref; +use std::{ + ops::Range, + path::{Path, PathBuf}, +}; + +/// Read access for maelstrom coverage/mapping quality VCF files. +/// +/// This only works robustly for WGS data. Also, it is assumed that each VCF +/// file contains exactly one sample. +pub struct Reader { + /// Path to the VCF file. + pub path: PathBuf, + /// Name of the single sample in the VCF file. + pub sample: String, + /// The internal reader. + pub reader: vcf::IndexedReader, + /// The header from the VCF file. + pub header: vcf::header::Header, +} + +impl Reader { + /// Construct reader with given path and sample name. + /// + /// The VCF file at `p` must have an index. + /// + /// # Arguments + /// + /// * `p` - Path to the tabix indexed and bgzip-compressed VCF file. + pub fn from_path

(p: P) -> Result + where + P: AsRef + Clone, + { + let path = p.as_ref().to_path_buf(); + let mut reader = vcf::indexed_reader::Builder::default().build_from_path(&path)?; + let header = reader.read_header()?; + let header: vcf::header::Header = header.parse()?; + let sample = if header.sample_names().len() == 1 { + header + .sample_names() + .iter() + .next() + .expect("just checked for ==1 sample") + .clone() + } else { + anyhow::bail!( + "VCF file {} must contain exactly one sample", + p.as_ref().display() + ); + }; + + Ok(Self { + path, + sample, + reader, + header, + }) + } + + /// Read coverage and mapping quality for given range and compute means. + /// + /// # Arguments + /// + /// * `chrom` - Chromosome name. + /// * `range` - 1-based range to read on `chrom`. + /// + /// # Returns + /// + /// The mean coverage and mean mapping quality bundled into one `Record`. + pub fn read(&mut self, chrom: &str, range: Range) -> Result { + // Jump to the given region. + let start: usize = range.start.try_into()?; + let pos_start = Position::try_from(start)?; + let end: usize = range.end.try_into()?; + let pos_end = Position::try_from(end)?; + let region = Region::new(chrom, pos_start..=pos_end); + let query = self.reader.query(&self.header, ®ion)?; + + // Read in the records and compute the mean coverage and mapping quality. The records + // describe the mean statistics over a window. We go all the way to compute fractions + // of windows. + let mut cov_sum = 0f64; + let mut mq_sum = 0f64; + let mut count = 0f64; + + for result in query { + let record = result?; + + let window_end = record.info().get(&vcf::header::info::key::END_POSITION); + use vcf::record::info::field::Value::Integer; + let window_end = if let Some(Some(Integer(window_end))) = window_end { + *window_end as usize + } else { + anyhow::bail!("missing INFO/END in record"); + }; + let window_start: usize = record.position().into(); + let window_size = window_end - window_start + 1; + + // Use first and last window values only in fractions. + let covered = if window_start < start { + window_size - (start - window_start) + } else if window_end > end { + window_size - (window_end - end) + } else { + window_size + }; + let covered: i32 = covered.try_into()?; + let covered: f64 = covered.try_into()?; + let window_size: i32 = window_size.try_into()?; + let window_size: f64 = window_size.try_into()?; + let factor = covered / window_size; + count += factor; + + let genotype = record + .genotypes() + .deref() + .iter() + .next() + .expect("just checked for ==1 sample"); + + // The simplest way to obtain the genotype keys is to iterate and call `as_ref()` on the + // key. + for (key, value) in genotype.deref().iter() { + match (key.as_ref(), value) { + ("CV", Some(vcf::record::genotypes::genotype::field::Value::Float(cov))) => { + cov_sum += factor * (*cov) as f64; + } + ("MQ", Some(vcf::record::genotypes::genotype::field::Value::Float(mq))) => { + mq_sum += factor * (*mq) as f64; + } + // Ignore all other keys. + _ => (), + } + } + } + + Ok(if count > 0f64 { + Record { + mean_coverage: cov_sum / count, + mean_mapq: mq_sum / count, + } + } else { + Default::default() + }) + } +} + +/// Result for `Reader::read`. +#[derive(Debug, Default, Clone, PartialEq)] +pub struct Record { + /// Mean quality. + pub mean_coverage: f64, + /// Mean mapping quality. + pub mean_mapq: f64, +} + +#[cfg(test)] +mod test { + use super::Reader; + use pretty_assertions::assert_eq; + + #[test] + fn reader_from_path() -> Result<(), anyhow::Error> { + let path = "tests/data/annotate/strucvars/maelstrom/example.SAMPLE.cov.vcf.gz"; + let reader = Reader::from_path(path)?; + assert_eq!(reader.sample, "SAMPLE"); + assert_eq!(format!("{}", reader.path.display()), path); + + Ok(()) + } + + #[test] + fn reader_read() -> Result<(), anyhow::Error> { + let path = "tests/data/annotate/strucvars/maelstrom/example.SAMPLE.cov.vcf.gz"; + let mut reader = Reader::from_path(path)?; + + { + let record = reader.read("1", 10_001..20_000)?; + assert_eq!(record.mean_coverage, 1.0); + assert_eq!(record.mean_mapq, 40.0); + } + + { + let record = reader.read("1", 10_501..20_000)?; + assert_eq!(record.mean_coverage, 1.0); + assert_eq!(record.mean_mapq, 40.0); + } + + { + let record = reader.read("1", 10_001..19_500)?; + assert_eq!(record.mean_coverage, 1.0); + assert_eq!(record.mean_mapq, 40.0); + } + + { + let record = reader.read("1", 10_501..19_500)?; + assert_eq!(record.mean_coverage, 1.0); + assert_eq!(record.mean_mapq, 40.0); + } + + Ok(()) + } +} diff --git a/src/annotate/strucvars/mod.rs b/src/annotate/strucvars/mod.rs index 79eaf5d1..a02578b5 100644 --- a/src/annotate/strucvars/mod.rs +++ b/src/annotate/strucvars/mod.rs @@ -1,5 +1,7 @@ //! Annotation of structural variant VCF files. +pub mod maelstrom; + use std::collections::HashSet; use std::fs::OpenOptions; use std::io::{BufReader, Write}; @@ -72,9 +74,11 @@ pub struct Args { #[arg(long, default_value_t = 5_000_000)] pub max_fb_tables: usize, - /// Seed for random number generator (UUIDs), if any. + /// Paths to the per-sample VCF files with coverage and mapping quality information as + /// generated by maelstrom-core. #[arg(long)] - pub rng_seed: Option, + pub path_cov_vcf: Vec, + /// Minimal reciprocal overlap to require. #[arg(long, default_value_t = 0.8)] pub min_overlap: f32, @@ -84,6 +88,10 @@ pub struct Args { /// Slack to use around insertions. #[arg(long, default_value_t = 50)] pub slack_ins: i32, + + /// Seed for random number generator (UUIDs), if any. + #[arg(long)] + pub rng_seed: Option, } /// Command line arguments to enforce either `--path-output-vcf` or `--path-output-tsv`. diff --git a/tests/data/annotate/strucvars/maelstrom/example.SAMPLE.cov.vcf.gz b/tests/data/annotate/strucvars/maelstrom/example.SAMPLE.cov.vcf.gz new file mode 100644 index 00000000..4bcf1123 --- /dev/null +++ b/tests/data/annotate/strucvars/maelstrom/example.SAMPLE.cov.vcf.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c805e01c492c5e8c14fa9191e43a6c9cf3b8cce39ca76228dfd991828dcc345 +size 212635 diff --git a/tests/data/annotate/strucvars/maelstrom/example.SAMPLE.cov.vcf.gz.tbi b/tests/data/annotate/strucvars/maelstrom/example.SAMPLE.cov.vcf.gz.tbi new file mode 100644 index 00000000..aaabab17 --- /dev/null +++ b/tests/data/annotate/strucvars/maelstrom/example.SAMPLE.cov.vcf.gz.tbi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d11adabcbef60488192c4ee91abafccf4904c6199813939068071eb0ab6ec0e7 +size 3501