Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: allow annotation from maelstorm cov/mq BED files (#51) #55

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -68,3 +68,4 @@ flatc-rust = "0.2.0"
csv = "1.2.0"
pretty_assertions = "1.3.0"
temp_testdir = "0.2.3"
rstest = "0.17.0"
209 changes: 209 additions & 0 deletions src/annotate/strucvars/maelstrom.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
//! Code for reading maelstrom coverage and mapping quality VCF files.

use noodles::{
core::{Position, Region},
vcf,
};
use std::ops::Deref;
use std::{
ops::Range,
path::{Path, PathBuf},
};

/// Read access for maelstrom coverage/mapping quality VCF files.
///
/// This only works robustly for WGS data. Also, it is assumed that each VCF
/// file contains exactly one sample.
pub struct Reader {
/// Path to the VCF file.
pub path: PathBuf,
/// Name of the single sample in the VCF file.
pub sample_name: String,
/// The internal reader.
pub reader: vcf::IndexedReader<std::fs::File>,
/// The header from the VCF file.
pub header: vcf::header::Header,
}

impl Reader {
/// Construct reader with given path and sample name.
///
/// The VCF file at `p` must have an index.
///
/// # Arguments
///
/// * `p` - Path to the tabix indexed and bgzip-compressed VCF file.
pub fn from_path<P>(p: P) -> Result<Self, anyhow::Error>
where
P: AsRef<Path> + Clone,
{
let path = p.as_ref().to_path_buf();
let mut reader = vcf::indexed_reader::Builder::default().build_from_path(&path)?;
let header = reader.read_header()?;
let header: vcf::header::Header = header.parse()?;
let sample = if header.sample_names().len() == 1 {
header
.sample_names()
.iter()
.next()
.expect("just checked for ==1 sample")
.clone()
} else {
anyhow::bail!(
"VCF file {} must contain exactly one sample",
p.as_ref().display()
);
};

Ok(Self {
path,
sample_name: sample,
reader,
header,
})
}

/// Read coverage and mapping quality for given range and compute means.
///
/// # Arguments
///
/// * `chrom` - Chromosome name.
/// * `range` - 1-based range to read on `chrom`.
///
/// # Returns
///
/// The mean coverage and mean mapping quality bundled into one `Record`.
pub fn read(&mut self, chrom: &str, range: Range<i32>) -> Result<Record, anyhow::Error> {
// Jump to the given region.
let start: usize = range.start.try_into()?;
let pos_start = Position::try_from(start)?;
let end: usize = range.end.try_into()?;
let pos_end = Position::try_from(end)?;
let region = Region::new(chrom, pos_start..=pos_end);
let query = self.reader.query(&self.header, &region)?;

// Read in the records and compute the mean coverage and mapping quality. The records
// describe the mean statistics over a window. We go all the way to compute fractions
// of windows.
let mut cov_sum = 0f64;
let mut mq_sum = 0f64;
let mut count = 0f64;

for result in query {
let record = result?;

let window_end = record.info().get(&vcf::header::info::key::END_POSITION);
use vcf::record::info::field::Value::Integer;
let window_end = if let Some(Some(Integer(window_end))) = window_end {
*window_end as usize
} else {
anyhow::bail!("missing INFO/END in record");
};
let window_start: usize = record.position().into();
let window_size = window_end - window_start + 1;

// Use first and last window values only in fractions.
let covered = if window_start < start {
window_size - (start - window_start)
} else if window_end > end {
window_size - (window_end - end)
} else {
window_size
};
let covered: i32 = covered.try_into()?;
let covered: f64 = covered.try_into()?;
let window_size: i32 = window_size.try_into()?;
let window_size: f64 = window_size.try_into()?;
let factor = covered / window_size;
count += factor;

let genotype = record
.genotypes()
.deref()
.iter()
.next()
.expect("just checked for ==1 sample");

// The simplest way to obtain the genotype keys is to iterate and call `as_ref()` on the
// key.
for (key, value) in genotype.deref().iter() {
match (key.as_ref(), value) {
("CV", Some(vcf::record::genotypes::genotype::field::Value::Float(cov))) => {
cov_sum += factor * (*cov) as f64;
}
("MQ", Some(vcf::record::genotypes::genotype::field::Value::Float(mq))) => {
mq_sum += factor * (*mq) as f64;
}
// Ignore all other keys.
_ => (),
}
}
}

Ok(if count > 0f64 {
Record {
mean_coverage: cov_sum / count,
mean_mapq: mq_sum / count,
}
} else {
Default::default()
})
}
}

/// Result for `Reader::read`.
#[derive(Debug, Default, Clone, PartialEq)]
pub struct Record {
/// Mean quality.
pub mean_coverage: f64,
/// Mean mapping quality.
pub mean_mapq: f64,
}

#[cfg(test)]
mod test {
use super::Reader;
use pretty_assertions::assert_eq;

#[test]
fn reader_from_path() -> Result<(), anyhow::Error> {
let path = "tests/data/annotate/strucvars/maelstrom/example.SAMPLE.cov.vcf.gz";
let reader = Reader::from_path(path)?;
assert_eq!(reader.sample_name, "SAMPLE");
assert_eq!(format!("{}", reader.path.display()), path);

Ok(())
}

#[test]
fn reader_read() -> Result<(), anyhow::Error> {
let path = "tests/data/annotate/strucvars/maelstrom/example.SAMPLE.cov.vcf.gz";
let mut reader = Reader::from_path(path)?;

{
let record = reader.read("1", 10_001..20_000)?;
assert_eq!(record.mean_coverage, 1.0);
assert_eq!(record.mean_mapq, 40.0);
}

{
let record = reader.read("1", 10_501..20_000)?;
assert_eq!(record.mean_coverage, 1.0);
assert_eq!(record.mean_mapq, 40.0);
}

{
let record = reader.read("1", 10_001..19_500)?;
assert_eq!(record.mean_coverage, 1.0);
assert_eq!(record.mean_mapq, 40.0);
}

{
let record = reader.read("1", 10_501..19_500)?;
assert_eq!(record.mean_coverage, 1.0);
assert_eq!(record.mean_mapq, 40.0);
}

Ok(())
}
}
Loading