From 89e4233a270d773588932e8ff84c1ed3db2110a9 Mon Sep 17 00:00:00 2001 From: Manuel Holtgrewe Date: Thu, 5 Oct 2023 14:23:49 +0200 Subject: [PATCH] wip --- src/seqvars/ingest/header.rs | 120 +++++++++++++++++- src/seqvars/ingest/mod.rs | 17 ++- ...@example_dragen.07.021.624.3.10.4.vcf.snap | 2 +- ...@example_dragen.07.021.624.3.10.9.vcf.snap | 2 +- ...t_header_38@example_gatk_hc.3.7-0.vcf.snap | 2 +- ...header_38@example_gatk_hc.4.4.0.0.vcf.snap | 2 +- ...header__test__build_output_header_ped.snap | 58 +++++++++ .../seqvars/ingest/example_gatk_hc.3.7-0.ped | 3 + 8 files changed, 195 insertions(+), 11 deletions(-) rename "src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header_38@\"example_dragen.07.021.624.3.10.4.vcf\".snap" => src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header_38@example_dragen.07.021.624.3.10.4.vcf.snap (98%) rename "src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header_38@\"example_dragen.07.021.624.3.10.9.vcf\".snap" => src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header_38@example_dragen.07.021.624.3.10.9.vcf.snap (98%) rename "src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header_38@\"example_gatk_hc.3.7-0.vcf\".snap" => src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header_38@example_gatk_hc.3.7-0.vcf.snap (97%) rename "src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header_38@\"example_gatk_hc.4.4.0.0.vcf\".snap" => src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header_38@example_gatk_hc.4.4.0.0.vcf.snap (98%) create mode 100644 src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header_ped.snap create mode 100644 tests/seqvars/ingest/example_gatk_hc.3.7-0.ped diff --git a/src/seqvars/ingest/header.rs b/src/seqvars/ingest/header.rs index 6659f606..28590822 100644 --- a/src/seqvars/ingest/header.rs +++ b/src/seqvars/ingest/header.rs @@ -1,3 +1,6 @@ +use std::collections::HashSet; + +use mehari::ped::{Disease, Sex}; use noodles_vcf as vcf; use crate::common::GenomeRelease; @@ -151,9 +154,28 @@ fn add_contigs_38(builder: vcf::header::Builder) -> Result String { + match sex { + Sex::Male => String::from("Male"), + Sex::Female => String::from("Female"), + Sex::Unknown => String::from("Unknown"), + } +} + +// Helper that returns header string value for `disease`. +fn disease_str(disease: Disease) -> String { + match disease { + Disease::Affected => String::from("Affected"), + Disease::Unaffected => String::from("Unaffected"), + Disease::Unknown => String::from("Unknown"), + } +} + /// Generate the output header from the input header. pub fn build_output_header( input_header: &vcf::Header, + pedigree: &Option, genomebuild: GenomeRelease, worker_version: &str, ) -> Result { @@ -285,11 +307,70 @@ pub fn build_output_header( .build()?, ); - let builder = match genomebuild { + let mut builder = match genomebuild { GenomeRelease::Grch37 => add_contigs_37(builder), GenomeRelease::Grch38 => add_contigs_38(builder), }?; + if let Some(pedigree) = pedigree { + let ped_idv = pedigree + .individuals + .iter() + .map(|(name, _)| name.clone()) + .collect::>(); + let input_idv = input_header + .sample_names() + .iter() + .map(|n| n.clone()) + .collect::>(); + if !ped_idv.eq(&input_idv) { + anyhow::bail!( + "pedigree individuals = {:?} != input individuals: {:?}", + &ped_idv, + &input_idv + ) + } + + for name in input_header.sample_names() { + let i = pedigree + .individuals + .get(name) + .expect("checked equality above"); + if input_header.sample_names().contains(&i.name) { + builder = builder.add_sample_name(i.name.clone()); + } + + // Add SAMPLE entry. + builder = builder.insert( + "SAMPLE".parse()?, + noodles_vcf::header::record::Value::Map( + i.name.clone(), + Map::::builder() + .insert("Sex".parse()?, sex_str(i.sex)) + .insert("Disease".parse()?, disease_str(i.disease)) + .build()?, + ), + )?; + + // Add PEDIGREE entry. + let mut map_builder = Map::::builder(); + if let Some(father) = i.father.as_ref() { + map_builder = map_builder.insert("Father".parse()?, father.clone()); + } + if let Some(mother) = i.mother.as_ref() { + map_builder = map_builder.insert("Mother".parse()?, mother.clone()); + } + builder = builder.insert( + "PEDIGREE".parse()?, + noodles_vcf::header::record::Value::Map(i.name.clone(), map_builder.build()?), + )?; + } + } else { + for name in input_header.sample_names() { + builder = builder.add_sample_name(name.clone()); + } + } + use vcf::header::record::value::map::Other; let orig_caller = VariantCaller::guess(input_header) @@ -329,13 +410,12 @@ pub fn build_output_header( )?, }; - // TODO: embed pedigree information - Ok(builder.build()) } #[cfg(test)] mod test { + use mehari::ped::PedigreeByName; use rstest::rstest; use super::VariantCaller; @@ -371,7 +451,7 @@ mod test { #[case("tests/seqvars/ingest/example_gatk_hc.3.7-0.vcf")] #[case("tests/seqvars/ingest/example_gatk_hc.4.4.0.0.vcf")] fn build_output_header_37(#[case] path: &str) -> Result<(), anyhow::Error> { - set_snapshot_suffix!("{:?}", path.split('/').last().unwrap()); + set_snapshot_suffix!("{}", path.split('/').last().unwrap()); let tmpdir = temp_testdir::TempDir::default(); let input_vcf_header = noodles_vcf::reader::Builder::default() @@ -379,6 +459,7 @@ mod test { .read_header()?; let output_vcf_header = super::build_output_header( &input_vcf_header, + &None, crate::common::GenomeRelease::Grch37, "x.y.z", )?; @@ -401,7 +482,7 @@ mod test { #[case("tests/seqvars/ingest/example_gatk_hc.3.7-0.vcf")] #[case("tests/seqvars/ingest/example_gatk_hc.4.4.0.0.vcf")] fn build_output_header_38(#[case] path: &str) -> Result<(), anyhow::Error> { - set_snapshot_suffix!("{:?}", path.split('/').last().unwrap()); + set_snapshot_suffix!("{}", path.split('/').last().unwrap()); let tmpdir = temp_testdir::TempDir::default(); let input_vcf_header = noodles_vcf::reader::Builder::default() @@ -409,6 +490,35 @@ mod test { .read_header()?; let output_vcf_header = super::build_output_header( &input_vcf_header, + &None, + crate::common::GenomeRelease::Grch38, + "x.y.z", + )?; + + let out_path = tmpdir.join("out.vcf"); + let out_path_str = out_path.to_str().expect("invalid path"); + { + noodles_vcf::writer::Writer::new(std::fs::File::create(out_path_str)?) + .write_header(&output_vcf_header)?; + } + + insta::assert_snapshot!(std::fs::read_to_string(out_path_str)?); + + Ok(()) + } + + #[test] + fn build_output_header_ped() -> Result<(), anyhow::Error> { + let tmpdir = temp_testdir::TempDir::default(); + + let pedigree = PedigreeByName::from_path("tests/seqvars/ingest/example_gatk_hc.3.7-0.ped")?; + + let input_vcf_header = noodles_vcf::reader::Builder::default() + .build_from_path("tests/seqvars/ingest/example_gatk_hc.3.7-0.vcf")? + .read_header()?; + let output_vcf_header = super::build_output_header( + &input_vcf_header, + &Some(pedigree), crate::common::GenomeRelease::Grch38, "x.y.z", )?; diff --git a/src/seqvars/ingest/mod.rs b/src/seqvars/ingest/mod.rs index 557a3d94..30868f46 100644 --- a/src/seqvars/ingest/mod.rs +++ b/src/seqvars/ingest/mod.rs @@ -12,6 +12,9 @@ pub struct Args { /// The assumed genome build. #[clap(long)] pub genomebuild: GenomeRelease, + /// Path to the pedigree file. + #[clap(long)] + pub path_ped: String, /// Path to input file. #[clap(long)] pub path_in: String, @@ -37,6 +40,11 @@ pub fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), anyhow: common::trace_rss_now(); + tracing::info!("loading pedigree..."); + let pedigree = mehari::ped::PedigreeByName::from_path(&args.path_ped) + .map_err(|e| anyhow::anyhow!("problem parsing PED file: {}", e))?; + tracing::info!("pedigre = {:#?}", &pedigree); + tracing::info!("opening input file..."); let mut input_reader = { let file = std::fs::File::open(&args.path_in) @@ -51,8 +59,12 @@ pub fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), anyhow: let input_header = input_reader .read_header() .map_err(|e| anyhow::anyhow!("problem reading VCF header: {}", e))?; - let output_header = - header::build_output_header(&input_header, args.genomebuild, worker_version())?; + let output_header = header::build_output_header( + &input_header, + &Some(pedigree), + args.genomebuild, + worker_version(), + )?; let mut output_writer = { let writer = std::fs::File::create(&args.path_out).map_err(|e| { @@ -100,6 +112,7 @@ mod test { let args_common = Default::default(); let args = super::Args { + path_ped: "tests/seqvars/ingest/example_gatk_hc.3.7-0.ped".into(), genomebuild: GenomeRelease::Grch37, path_in: path.into(), path_out: tmpdir diff --git "a/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header_38@\"example_dragen.07.021.624.3.10.4.vcf\".snap" b/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header_38@example_dragen.07.021.624.3.10.4.vcf.snap similarity index 98% rename from "src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header_38@\"example_dragen.07.021.624.3.10.4.vcf\".snap" rename to src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header_38@example_dragen.07.021.624.3.10.4.vcf.snap index af09d34c..501f62ed 100644 --- "a/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header_38@\"example_dragen.07.021.624.3.10.4.vcf\".snap" +++ b/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header_38@example_dragen.07.021.624.3.10.4.vcf.snap @@ -48,5 +48,5 @@ expression: "std::fs::read_to_string(out_path_str)?" ##contig= ##x-varfish-version= ##x-varfish-version= -#CHROM POS ID REF ALT QUAL FILTER INFO +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 diff --git "a/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header_38@\"example_dragen.07.021.624.3.10.9.vcf\".snap" b/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header_38@example_dragen.07.021.624.3.10.9.vcf.snap similarity index 98% rename from "src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header_38@\"example_dragen.07.021.624.3.10.9.vcf\".snap" rename to src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header_38@example_dragen.07.021.624.3.10.9.vcf.snap index 5e9f9bd5..217a78b5 100644 --- "a/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header_38@\"example_dragen.07.021.624.3.10.9.vcf\".snap" +++ b/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header_38@example_dragen.07.021.624.3.10.9.vcf.snap @@ -48,5 +48,5 @@ expression: "std::fs::read_to_string(out_path_str)?" ##contig= ##x-varfish-version= ##x-varfish-version= -#CHROM POS ID REF ALT QUAL FILTER INFO +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT CASE diff --git "a/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header_38@\"example_gatk_hc.3.7-0.vcf\".snap" b/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header_38@example_gatk_hc.3.7-0.vcf.snap similarity index 97% rename from "src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header_38@\"example_gatk_hc.3.7-0.vcf\".snap" rename to src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header_38@example_gatk_hc.3.7-0.vcf.snap index 7cd3b298..8c6ecc4f 100644 --- "a/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header_38@\"example_gatk_hc.3.7-0.vcf\".snap" +++ b/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header_38@example_gatk_hc.3.7-0.vcf.snap @@ -48,5 +48,5 @@ expression: "std::fs::read_to_string(out_path_str)?" ##contig= ##x-varfish-version= ##x-varfish-version= -#CHROM POS ID REF ALT QUAL FILTER INFO +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Case_1_father-N1-DNA1-WGS1 Case_1_index-N1-DNA1-WGS1 Case_1_mother-N1-DNA1-WGS1 diff --git "a/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header_38@\"example_gatk_hc.4.4.0.0.vcf\".snap" b/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header_38@example_gatk_hc.4.4.0.0.vcf.snap similarity index 98% rename from "src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header_38@\"example_gatk_hc.4.4.0.0.vcf\".snap" rename to src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header_38@example_gatk_hc.4.4.0.0.vcf.snap index ca90c579..b3876668 100644 --- "a/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header_38@\"example_gatk_hc.4.4.0.0.vcf\".snap" +++ b/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header_38@example_gatk_hc.4.4.0.0.vcf.snap @@ -48,5 +48,5 @@ expression: "std::fs::read_to_string(out_path_str)?" ##contig= ##x-varfish-version= ##x-varfish-version= -#CHROM POS ID REF ALT QUAL FILTER INFO +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT CASE diff --git a/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header_ped.snap b/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header_ped.snap new file mode 100644 index 00000000..16d943b7 --- /dev/null +++ b/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header_ped.snap @@ -0,0 +1,58 @@ +--- +source: src/seqvars/ingest/header.rs +expression: "std::fs::read_to_string(out_path_str)?" +--- +##fileformat=VCFv4.4 +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##SAMPLE= +##SAMPLE= +##SAMPLE= +##PEDIGREE= +##PEDIGREE= +##PEDIGREE= +##x-varfish-version= +##x-varfish-version= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Case_1_father-N1-DNA1-WGS1 Case_1_index-N1-DNA1-WGS1 Case_1_mother-N1-DNA1-WGS1 + diff --git a/tests/seqvars/ingest/example_gatk_hc.3.7-0.ped b/tests/seqvars/ingest/example_gatk_hc.3.7-0.ped new file mode 100644 index 00000000..9d4c9fc0 --- /dev/null +++ b/tests/seqvars/ingest/example_gatk_hc.3.7-0.ped @@ -0,0 +1,3 @@ +FAM Case_1_index-N1-DNA1-WGS1 Case_1_father-N1-DNA1-WGS1 Case_1_mother-N1-DNA1-WGS1 2 2 +FAM Case_1_father-N1-DNA1-WGS1 0 0 1 1 +FAM Case_1_mother-N1-DNA1-WGS1 0 0 1 1