diff --git a/Cargo.toml b/Cargo.toml index 0f8c4022..45be6071 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -68,3 +68,4 @@ flatc-rust = "0.2.0" csv = "1.2.0" pretty_assertions = "1.3.0" temp_testdir = "0.2.3" +rstest = "0.17.0" diff --git a/src/annotate/strucvars/maelstrom.rs b/src/annotate/strucvars/maelstrom.rs new file mode 100644 index 00000000..8b98f36b --- /dev/null +++ b/src/annotate/strucvars/maelstrom.rs @@ -0,0 +1,209 @@ +//! Code for reading maelstrom coverage and mapping quality VCF files. + +use noodles::{ + core::{Position, Region}, + vcf, +}; +use std::ops::Deref; +use std::{ + ops::Range, + path::{Path, PathBuf}, +}; + +/// Read access for maelstrom coverage/mapping quality VCF files. +/// +/// This only works robustly for WGS data. Also, it is assumed that each VCF +/// file contains exactly one sample. +pub struct Reader { + /// Path to the VCF file. + pub path: PathBuf, + /// Name of the single sample in the VCF file. + pub sample_name: String, + /// The internal reader. + pub reader: vcf::IndexedReader, + /// The header from the VCF file. + pub header: vcf::header::Header, +} + +impl Reader { + /// Construct reader with given path and sample name. + /// + /// The VCF file at `p` must have an index. + /// + /// # Arguments + /// + /// * `p` - Path to the tabix indexed and bgzip-compressed VCF file. + pub fn from_path

(p: P) -> Result + where + P: AsRef + Clone, + { + let path = p.as_ref().to_path_buf(); + let mut reader = vcf::indexed_reader::Builder::default().build_from_path(&path)?; + let header = reader.read_header()?; + let header: vcf::header::Header = header.parse()?; + let sample = if header.sample_names().len() == 1 { + header + .sample_names() + .iter() + .next() + .expect("just checked for ==1 sample") + .clone() + } else { + anyhow::bail!( + "VCF file {} must contain exactly one sample", + p.as_ref().display() + ); + }; + + Ok(Self { + path, + sample_name: sample, + reader, + header, + }) + } + + /// Read coverage and mapping quality for given range and compute means. + /// + /// # Arguments + /// + /// * `chrom` - Chromosome name. + /// * `range` - 1-based range to read on `chrom`. + /// + /// # Returns + /// + /// The mean coverage and mean mapping quality bundled into one `Record`. + pub fn read(&mut self, chrom: &str, range: Range) -> Result { + // Jump to the given region. + let start: usize = range.start.try_into()?; + let pos_start = Position::try_from(start)?; + let end: usize = range.end.try_into()?; + let pos_end = Position::try_from(end)?; + let region = Region::new(chrom, pos_start..=pos_end); + let query = self.reader.query(&self.header, ®ion)?; + + // Read in the records and compute the mean coverage and mapping quality. The records + // describe the mean statistics over a window. We go all the way to compute fractions + // of windows. + let mut cov_sum = 0f64; + let mut mq_sum = 0f64; + let mut count = 0f64; + + for result in query { + let record = result?; + + let window_end = record.info().get(&vcf::header::info::key::END_POSITION); + use vcf::record::info::field::Value::Integer; + let window_end = if let Some(Some(Integer(window_end))) = window_end { + *window_end as usize + } else { + anyhow::bail!("missing INFO/END in record"); + }; + let window_start: usize = record.position().into(); + let window_size = window_end - window_start + 1; + + // Use first and last window values only in fractions. + let covered = if window_start < start { + window_size - (start - window_start) + } else if window_end > end { + window_size - (window_end - end) + } else { + window_size + }; + let covered: i32 = covered.try_into()?; + let covered: f64 = covered.try_into()?; + let window_size: i32 = window_size.try_into()?; + let window_size: f64 = window_size.try_into()?; + let factor = covered / window_size; + count += factor; + + let genotype = record + .genotypes() + .deref() + .iter() + .next() + .expect("just checked for ==1 sample"); + + // The simplest way to obtain the genotype keys is to iterate and call `as_ref()` on the + // key. + for (key, value) in genotype.deref().iter() { + match (key.as_ref(), value) { + ("CV", Some(vcf::record::genotypes::genotype::field::Value::Float(cov))) => { + cov_sum += factor * (*cov) as f64; + } + ("MQ", Some(vcf::record::genotypes::genotype::field::Value::Float(mq))) => { + mq_sum += factor * (*mq) as f64; + } + // Ignore all other keys. + _ => (), + } + } + } + + Ok(if count > 0f64 { + Record { + mean_coverage: cov_sum / count, + mean_mapq: mq_sum / count, + } + } else { + Default::default() + }) + } +} + +/// Result for `Reader::read`. +#[derive(Debug, Default, Clone, PartialEq)] +pub struct Record { + /// Mean quality. + pub mean_coverage: f64, + /// Mean mapping quality. + pub mean_mapq: f64, +} + +#[cfg(test)] +mod test { + use super::Reader; + use pretty_assertions::assert_eq; + + #[test] + fn reader_from_path() -> Result<(), anyhow::Error> { + let path = "tests/data/annotate/strucvars/maelstrom/example.SAMPLE.cov.vcf.gz"; + let reader = Reader::from_path(path)?; + assert_eq!(reader.sample_name, "SAMPLE"); + assert_eq!(format!("{}", reader.path.display()), path); + + Ok(()) + } + + #[test] + fn reader_read() -> Result<(), anyhow::Error> { + let path = "tests/data/annotate/strucvars/maelstrom/example.SAMPLE.cov.vcf.gz"; + let mut reader = Reader::from_path(path)?; + + { + let record = reader.read("1", 10_001..20_000)?; + assert_eq!(record.mean_coverage, 1.0); + assert_eq!(record.mean_mapq, 40.0); + } + + { + let record = reader.read("1", 10_501..20_000)?; + assert_eq!(record.mean_coverage, 1.0); + assert_eq!(record.mean_mapq, 40.0); + } + + { + let record = reader.read("1", 10_001..19_500)?; + assert_eq!(record.mean_coverage, 1.0); + assert_eq!(record.mean_mapq, 40.0); + } + + { + let record = reader.read("1", 10_501..19_500)?; + assert_eq!(record.mean_coverage, 1.0); + assert_eq!(record.mean_mapq, 40.0); + } + + Ok(()) + } +} diff --git a/src/annotate/strucvars/mod.rs b/src/annotate/strucvars/mod.rs index 79eaf5d1..37585bad 100644 --- a/src/annotate/strucvars/mod.rs +++ b/src/annotate/strucvars/mod.rs @@ -1,6 +1,8 @@ //! Annotation of structural variant VCF files. -use std::collections::HashSet; +pub mod maelstrom; + +use std::collections::{HashMap, HashSet}; use std::fs::OpenOptions; use std::io::{BufReader, Write}; use std::path::Path; @@ -72,9 +74,11 @@ pub struct Args { #[arg(long, default_value_t = 5_000_000)] pub max_fb_tables: usize, - /// Seed for random number generator (UUIDs), if any. + /// Paths to the per-sample VCF files with coverage and mapping quality information as + /// generated by maelstrom-core. #[arg(long)] - pub rng_seed: Option, + pub path_cov_vcf: Vec, + /// Minimal reciprocal overlap to require. #[arg(long, default_value_t = 0.8)] pub min_overlap: f32, @@ -84,6 +88,10 @@ pub struct Args { /// Slack to use around insertions. #[arg(long, default_value_t = 50)] pub slack_ins: i32, + + /// Seed for random number generator (UUIDs), if any. + #[arg(long)] + pub rng_seed: Option, } /// Command line arguments to enforce either `--path-output-vcf` or `--path-output-tsv`. @@ -2387,6 +2395,7 @@ pub fn build_vcf_record_converter>( fn run_vcf_to_jsonl( path_input: &str, tmp_dir: &TempDir, + cov_readers: &mut HashMap, rng: &mut StdRng, ) -> Result<(), anyhow::Error> { tracing::debug!("opening temporary files in {}", tmp_dir.path().display()); @@ -2428,7 +2437,8 @@ fn run_vcf_to_jsonl( rng.fill_bytes(&mut uuid_buf); let uuid = Uuid::from_bytes(uuid_buf); - let record = converter.convert(&record?, uuid, GenomeRelease::Grch37)?; + let mut record = converter.convert(&record?, uuid, GenomeRelease::Grch37)?; + annotate_cov_mq(&mut record, cov_readers)?; if let Some(chromosome_no) = mapping.get(&record.chromosome) { let out_jsonl = &mut tmp_files[*chromosome_no as usize - 1]; jsonl::write(out_jsonl, &record)?; @@ -2443,6 +2453,38 @@ fn run_vcf_to_jsonl( Ok(()) } +/// Annotate `record` with coverage information from `cov_readers`. +fn annotate_cov_mq( + record: &mut VarFishStrucvarTsvRecord, + cov_readers: &mut HashMap, +) -> Result<(), anyhow::Error> { + for entry in record.genotype.entries.iter_mut() { + if let Some(reader) = cov_readers.get_mut(&entry.name) { + // Only read for "linear" SVs (IOW: not for INS or BND). + if matches!(record.sv_type, SvType::Del | SvType::Dup | SvType::Inv) { + let res = reader + .read(&record.chromosome, record.start..record.end) + .map_err(|e| { + anyhow::anyhow!( + "problem querying {}:{}-{} for {}: {}", + &record.chromosome, + record.start, + record.end, + entry.name, + e + ) + })?; + entry.anc = Some(res.mean_coverage as f32); + entry.amq = Some(res.mean_mapq as i32); + } + } else { + tracing::warn!("no coverage information for sample {}", entry.name); + } + } + + Ok(()) +} + /// Read through the JSONL file in `tmp_dir` for contig no. `contig_no` and cluster the records. /// /// # Arguments @@ -2566,12 +2608,24 @@ fn run_with_writer( args: &Args, pedigree: &PedigreeByName, ) -> Result<(), anyhow::Error> { + // Initialize the random number generator from command line seed if given or local entropy + // source. let mut rng = if let Some(rng_seed) = args.rng_seed { StdRng::seed_from_u64(rng_seed) } else { StdRng::from_entropy() }; + // Load maelstrom coverage tracks if given. + tracing::info!("Opening coverage tracks..."); + let mut cov_readers: HashMap = HashMap::new(); + for path_cov in &args.path_cov_vcf { + tracing::debug!(" path: {}", path_cov); + let reader = maelstrom::Reader::from_path(path_cov)?; + cov_readers.insert(reader.sample_name.clone(), reader); + } + tracing::info!("... done opening coverage tracks"); + // Create temporary directory. We will create one temporary file (containing `jsonl` // seriealized `VarFishStrucvarTsvRecord`s) for each SV type and contig. let tmp_dir = TempDir::new("mehari")?; @@ -2579,7 +2633,7 @@ fn run_with_writer( // Read through input VCF files and write out to temporary files. tracing::info!("Input VCF files to temporary files..."); for path_input in args.path_input_vcf.iter() { - run_vcf_to_jsonl(path_input, &tmp_dir, &mut rng)?; + run_vcf_to_jsonl(path_input, &tmp_dir, &mut cov_readers, &mut rng)?; } tracing::info!("... done converting input files."); @@ -2776,9 +2830,11 @@ pub mod bnd { #[cfg(test)] mod test { + use rstest::rstest; use std::fs::File; use chrono::NaiveDate; + use clap_verbosity_flag::Verbosity; use hgvs::static_data::Assembly; use linked_hash_map::LinkedHashMap; use noodles::vcf; @@ -2805,8 +2861,8 @@ mod test { DellyVcfRecordConverter, DragenCnvVcfRecordConverter, DragenSvVcfRecordConverter, GcnvVcfRecordConverter, MantaVcfRecordConverter, PopdelVcfRecordConverter, }, - guess_sv_caller, vcf_header, VarFishStrucvarTsvWriter, VcfHeader, VcfRecord, - VcfRecordConverter, + guess_sv_caller, run, vcf_header, Args, PathOutput, VarFishStrucvarTsvWriter, VcfHeader, + VcfRecord, VcfRecordConverter, }; /// Test for the parsing of breakend alleles. @@ -3404,4 +3460,59 @@ mod test { Ok(()) } + + #[rstest] + #[case(true)] + #[case(false)] + fn test_with_maelstrom_reader(#[case] is_tsv: bool) -> Result<(), anyhow::Error> { + let temp = TempDir::default(); + + let args_common = crate::common::Args { + verbose: Verbosity::new(0, 1), + }; + + let suffix = if is_tsv { ".tsv" } else { ".vcf" }; + let out_path = temp.join(format!("out{}", suffix)); + + let args = Args { + path_db: String::from("tests/data/db/create"), + genome_release: Some(GenomeRelease::Grch37), + path_input_ped: String::from("tests/data/annotate/strucvars/maelstrom/delly2-min.ped"), + path_input_vcf: vec![String::from( + "tests/data/annotate/strucvars/maelstrom/delly2-min.vcf", + )], + output: if is_tsv { + PathOutput { + path_output_vcf: None, + path_output_tsv: Some(format!("{}", out_path.display())), + } + } else { + PathOutput { + path_output_vcf: Some(format!("{}", out_path.display())), + path_output_tsv: None, + } + }, + max_var_count: None, + max_fb_tables: 5_000_000, + path_cov_vcf: vec![String::from( + "tests/data/annotate/strucvars/maelstrom/example.SAMPLE.cov.vcf.gz", + )], + min_overlap: 0.8, + slack_bnd: 50, + slack_ins: 50, + rng_seed: Some(42), + }; + + run(&args_common, &args)?; + + let expected = std::fs::read_to_string(format!( + "tests/data/annotate/strucvars/maelstrom/delly2-min-with-maelstrom{}", + suffix + ))?; + let actual = std::fs::read_to_string(out_path)?; + + assert_eq!(actual, expected); + + Ok(()) + } } diff --git a/tests/data/annotate/strucvars/maelstrom/delly2-min-with-maelstrom.tsv b/tests/data/annotate/strucvars/maelstrom/delly2-min-with-maelstrom.tsv new file mode 100644 index 00000000..3ba40466 --- /dev/null +++ b/tests/data/annotate/strucvars/maelstrom/delly2-min-with-maelstrom.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac8e17de77e9fabc1dca4b7da0f4e823c1fd219dbccc102e80b95d9432aa88b2 +size 424 diff --git a/tests/data/annotate/strucvars/maelstrom/delly2-min-with-maelstrom.vcf b/tests/data/annotate/strucvars/maelstrom/delly2-min-with-maelstrom.vcf new file mode 100644 index 00000000..c8eb6aaf --- /dev/null +++ b/tests/data/annotate/strucvars/maelstrom/delly2-min-with-maelstrom.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14c0bc59f93f06ba1357e9396dc4917ca4507d034c5ae1b4905e52fcb9eba27f +size 4403 diff --git a/tests/data/annotate/strucvars/maelstrom/delly2-min.ped b/tests/data/annotate/strucvars/maelstrom/delly2-min.ped new file mode 100644 index 00000000..ee9d707c --- /dev/null +++ b/tests/data/annotate/strucvars/maelstrom/delly2-min.ped @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41d1d79a956945e2b923979bec0d90467c3e2e4783e3fc5d34c7b188883aab9b +size 19 diff --git a/tests/data/annotate/strucvars/maelstrom/delly2-min.vcf b/tests/data/annotate/strucvars/maelstrom/delly2-min.vcf new file mode 100644 index 00000000..fc1a5718 --- /dev/null +++ b/tests/data/annotate/strucvars/maelstrom/delly2-min.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d1219ed597e38ecff436a914d6c211c388c9ae4e6e9aa04862368fe9b9f315e +size 7441 diff --git a/tests/data/annotate/strucvars/maelstrom/example.SAMPLE.cov.vcf.gz b/tests/data/annotate/strucvars/maelstrom/example.SAMPLE.cov.vcf.gz new file mode 100644 index 00000000..4bcf1123 --- /dev/null +++ b/tests/data/annotate/strucvars/maelstrom/example.SAMPLE.cov.vcf.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c805e01c492c5e8c14fa9191e43a6c9cf3b8cce39ca76228dfd991828dcc345 +size 212635 diff --git a/tests/data/annotate/strucvars/maelstrom/example.SAMPLE.cov.vcf.gz.tbi b/tests/data/annotate/strucvars/maelstrom/example.SAMPLE.cov.vcf.gz.tbi new file mode 100644 index 00000000..aaabab17 --- /dev/null +++ b/tests/data/annotate/strucvars/maelstrom/example.SAMPLE.cov.vcf.gz.tbi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d11adabcbef60488192c4ee91abafccf4904c6199813939068071eb0ab6ec0e7 +size 3501