From 720fac7d8766e2aecd1920321d37daedc8e5e155 Mon Sep 17 00:00:00 2001 From: Manuel Holtgrewe Date: Thu, 5 Oct 2023 10:13:57 +0200 Subject: [PATCH] feat: implement "seqvars ingest" command (#199) --- Cargo.lock | 1 + Cargo.toml | 1 + README.md | 58 +++++++ src/common.rs | 8 + src/main.rs | 23 +++ src/seqvars/ingest/header.rs | 113 ++++++++++++++ src/seqvars/ingest/mod.rs | 103 ++++++++++++ ...e_dragen.07.021.624.3.10.4.vcf\".snap.new" | 124 +++++++++++++++ ...e_dragen.07.021.624.3.10.9.vcf\".snap.new" | 146 ++++++++++++++++++ ...er@\"example_gatk_hc.3.7-0.vcf\".snap.new" | 126 +++++++++++++++ ...@\"example_gatk_hc.4.4.0.0.vcf\".snap.new" | 146 ++++++++++++++++++ ...ample_dragen.07.021.624.3.10.4.vcf\".snap" | 7 + ...ample_dragen.07.021.624.3.10.9.vcf\".snap" | 7 + ...ample_dragen_07_021_624_3_10_9_vcf\".snap" | 7 + ..._guess@\"example_gatk_hc.3.7-0.vcf\".snap" | 7 + ...uess@\"example_gatk_hc.4.4.0.0.vcf\".snap" | 7 + src/seqvars/mod.rs | 1 + .../example_dragen.07.021.624.3.10.4.vcf | 120 ++++++++++++++ .../example_dragen.07.021.624.3.10.9.vcf | 142 +++++++++++++++++ .../seqvars/ingest/example_gatk_hc.3.7-0.vcf | 122 +++++++++++++++ .../ingest/example_gatk_hc.4.4.0.0.vcf | 142 +++++++++++++++++ 21 files changed, 1411 insertions(+) create mode 100644 src/seqvars/ingest/header.rs create mode 100644 src/seqvars/ingest/mod.rs create mode 100644 "src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header@\"example_dragen.07.021.624.3.10.4.vcf\".snap.new" create mode 100644 "src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header@\"example_dragen.07.021.624.3.10.9.vcf\".snap.new" create mode 100644 "src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header@\"example_gatk_hc.3.7-0.vcf\".snap.new" create mode 100644 
"src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header@\"example_gatk_hc.4.4.0.0.vcf\".snap.new" create mode 100644 "src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__variant_caller_guess@\"example_dragen.07.021.624.3.10.4.vcf\".snap" create mode 100644 "src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__variant_caller_guess@\"example_dragen.07.021.624.3.10.9.vcf\".snap" create mode 100644 "src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__variant_caller_guess@\"example_dragen_07_021_624_3_10_9_vcf\".snap" create mode 100644 "src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__variant_caller_guess@\"example_gatk_hc.3.7-0.vcf\".snap" create mode 100644 "src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__variant_caller_guess@\"example_gatk_hc.4.4.0.0.vcf\".snap" create mode 100644 src/seqvars/mod.rs create mode 100644 tests/seqvars/ingest/example_dragen.07.021.624.3.10.4.vcf create mode 100644 tests/seqvars/ingest/example_dragen.07.021.624.3.10.9.vcf create mode 100644 tests/seqvars/ingest/example_gatk_hc.3.7-0.vcf create mode 100644 tests/seqvars/ingest/example_gatk_hc.4.4.0.0.vcf diff --git a/Cargo.lock b/Cargo.lock index ce5a457f..af28aa07 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3442,6 +3442,7 @@ dependencies = [ "log", "mehari", "multimap 0.9.0", + "noodles-vcf", "pretty_assertions", "procfs", "prost", diff --git a/Cargo.toml b/Cargo.toml index 376aaddc..bb31bb89 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -47,6 +47,7 @@ thousands = "0.2" tracing = "0.1" tracing-subscriber = "0.3" uuid = { version = "1.4", features = ["v4", "fast-rng", "serde"] } +noodles-vcf = "0.40.0" [build-dependencies] diff --git a/README.md b/README.md index b0827246..f80d0da4 100644 --- a/README.md +++ b/README.md @@ -71,6 +71,64 @@ $ varfish-server-worker db mk-inhouse \ 
[--path-input-tsvs @IN/path-list2.txt] ``` +## The `seqvars ingest` Command + +This command takes as the input a single VCF file from a (supported) variant caller and converts it into a file for further querying. +The command interprets the following fields which are written out by the commonly used variant callers such as GATK UnifiedGenotyper, GATK HaplotypeCaller, and Illumina Dragen. + +- `FORMAT/AD` -- allelic depth, one value per allele (including reference) +- `FORMAT/DP` -- total read coverage +- `FORMAT/GQ` -- genotype quality +- `FORMAT/GT` -- genotype +- `FORMAT/PID` -- physical phasing information as written out by GATK HaplotypeCaller in GVCF workflow +- `FORMAT/PIS` -- physical phasing information as written out by Dragen variant caller + - this field will be written as `FORMAT/PID` +- `FORMAT/SQ` -- "somatic quality" for each alternate allele, as written out by Illumina Dragen variant caller + - this field will be written as `FORMAT/GQ` + +The `seqvars ingest` command will annotate the variants with the following information: + +- gnomAD genomes and exomes allele frequencies +- gnomAD-mtDNA and HelixMtDb allele frequencies +- functional annotation following the [VCF ANN field standard](https://pcingola.github.io/SnpEff/adds/VCFannotationformat_v1.0.pdf) + +The command will emit one output line for each variant allele from the input and each affected gene. +That is, if two variant alleles affect two genes, four records will be written to the output file. +The annotation will be written out for one highest impact. + +Overall, the command will emit the following header rows in addition to the `##contig=` lines. 
+ +``` +##fileformat=VCFv4.2 +##FILTER= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##x-varfish-version= +##x-varfish-version= +##x-varfish-version= +``` + +> [!NOTE] +> The gnomad-mtDNA information is written to the `INFO/gnomad_genome_*` fields. + +> [!NOTE] +> Future versions of the worker will annotate the worst effect on a MANE select or MANE Clinical transcript. + # Developer Information This section is only relevant for developers of `varfish-server-worker`. diff --git a/src/common.rs b/src/common.rs index 33dea389..fee69574 100644 --- a/src/common.rs +++ b/src/common.rs @@ -23,6 +23,14 @@ pub struct Args { pub verbose: Verbosity, } +impl Default for Args { + fn default() -> Self { + Self { + verbose: Verbosity::new(0, 0), + } + } +} + /// Helper to print the current memory resident set size via `tracing`. pub fn trace_rss_now() { let me = procfs::process::Process::myself().unwrap(); diff --git a/src/main.rs b/src/main.rs index fc2103c6..6ea9bca6 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,6 +2,7 @@ pub mod common; pub mod db; +pub mod seqvars; pub mod sv; use clap::{Args, Parser, Subcommand}; @@ -33,6 +34,8 @@ enum Commands { Db(Db), /// SV filtration related commands. Sv(Sv), + /// Sequence variant related commands. + Seqvars(Seqvars), } /// Parsing of "db *" sub commands. @@ -67,6 +70,21 @@ enum SvCommands { Query(sv::query::Args), } +/// Parsing of "seqvars *" sub commands. +#[derive(Debug, Args)] +#[command(args_conflicts_with_subcommands = true)] +struct Seqvars { + /// The sub command to run + #[command(subcommand)] + command: SeqvarsCommands, +} + +/// Enum supporting the parsing of "seqvars *" sub commands. 
+#[derive(Debug, Subcommand)] +enum SeqvarsCommands { + Ingest(seqvars::ingest::Args), +} + fn main() -> Result<(), anyhow::Error> { let cli = Cli::parse(); @@ -98,6 +116,11 @@ fn main() -> Result<(), anyhow::Error> { db::to_bin::cli::run(&cli.common, args)?; } }, + Commands::Seqvars(seqvars) => match &seqvars.command { + SeqvarsCommands::Ingest(args) => { + seqvars::ingest::run(&cli.common, args)?; + } + }, Commands::Sv(sv) => match &sv.command { SvCommands::Query(args) => { sv::query::run(&cli.common, args)?; diff --git a/src/seqvars/ingest/header.rs b/src/seqvars/ingest/header.rs new file mode 100644 index 00000000..6f2c51e3 --- /dev/null +++ b/src/seqvars/ingest/header.rs @@ -0,0 +1,113 @@ +use noodles_vcf as vcf; + +use crate::common::GenomeRelease; + +/// Enumeration for the known variant callers. +#[derive(Debug, Clone, PartialEq, Eq, serde::Deserialize, serde::Serialize)] +pub enum VariantCaller { + GatkHaplotypeCaller { version: String }, + GatkUnifiedGenotyper { version: String }, + Dragen { version: String }, + Other, +} + +impl VariantCaller { + pub fn guess(header: &vcf::Header) -> Option { + for (other, collection) in header.other_records() { + if other.as_ref().starts_with("GATKCommandLine") + || other.as_ref().starts_with("DRAGENCommandLine") + { + use vcf::header::record::value::collection::Collection; + if let Collection::Structured(map) = collection { + for (key, values) in map.iter() { + if let ("HaplotypeCaller", Some(version)) = + (key.as_str(), values.other_fields().get("Version").cloned()) + { + return Some(VariantCaller::GatkHaplotypeCaller { version }); + } + if let ("UnifiedGenotyper", Some(version)) = + (key.as_str(), values.other_fields().get("Version").cloned()) + { + return Some(VariantCaller::GatkUnifiedGenotyper { version }); + } + if let ("dragen", Some(version)) = + (key.as_str(), values.other_fields().get("Version").cloned()) + { + return Some(VariantCaller::Dragen { version }); + } + } + } + } + } + None + } +} + +/// Generate 
the output header from the input header. +pub fn build_output_header( + input_header: &vcf::Header, + genomebuild: GenomeRelease, +) -> Result { + let variant_caller = VariantCaller::guess(input_header) + .ok_or_else(|| anyhow::anyhow!("Unable to guess variant caller"))?; + todo!() +} + +#[cfg(test)] +mod test { + use rstest::rstest; + + use super::VariantCaller; + + macro_rules! set_snapshot_suffix { + ($($expr:expr),*) => { + let mut settings = insta::Settings::clone_current(); + settings.set_snapshot_suffix(format!($($expr,)*)); + let _guard = settings.bind_to_scope(); + } + } + + #[rstest] + #[case("tests/seqvars/ingest/example_dragen.07.021.624.3.10.4.vcf")] + #[case("tests/seqvars/ingest/example_dragen.07.021.624.3.10.9.vcf")] + #[case("tests/seqvars/ingest/example_gatk_hc.3.7-0.vcf")] + #[case("tests/seqvars/ingest/example_gatk_hc.4.4.0.0.vcf")] + fn variant_caller_guess(#[case] path: &str) -> Result<(), anyhow::Error> { + set_snapshot_suffix!("{:?}", path.split('/').last().unwrap()); + + let vcf_header = noodles_vcf::reader::Builder::default() + .build_from_path(path)? + .read_header()?; + + insta::assert_yaml_snapshot!(VariantCaller::guess(&vcf_header)); + + Ok(()) + } + + #[rstest] + #[case("tests/seqvars/ingest/example_dragen.07.021.624.3.10.4.vcf")] + #[case("tests/seqvars/ingest/example_dragen.07.021.624.3.10.9.vcf")] + #[case("tests/seqvars/ingest/example_gatk_hc.3.7-0.vcf")] + #[case("tests/seqvars/ingest/example_gatk_hc.4.4.0.0.vcf")] + fn build_output_header(#[case] path: &str) -> Result<(), anyhow::Error> { + set_snapshot_suffix!("{:?}", path.split('/').last().unwrap()); + let tmpdir = temp_testdir::TempDir::default(); + + let input_vcf_header = noodles_vcf::reader::Builder::default() + .build_from_path(path)? 
+ .read_header()?; + let output_vcf_header = + super::build_output_header(&input_vcf_header, crate::common::GenomeRelease::Grch37)?; + + let out_path = tmpdir.join("out.vcf"); + let out_path_str = out_path.to_str().expect("invalid path"); + { + noodles_vcf::writer::Writer::new(std::fs::File::create(out_path_str)?) + .write_header(&input_vcf_header)?; + } + + insta::assert_snapshot!(std::fs::read_to_string(out_path_str)?); + + Ok(()) + } +} diff --git a/src/seqvars/ingest/mod.rs b/src/seqvars/ingest/mod.rs new file mode 100644 index 00000000..bd487577 --- /dev/null +++ b/src/seqvars/ingest/mod.rs @@ -0,0 +1,103 @@ +//! Implementation of `seqvars ingest` subcommand. + +use crate::common::{self, GenomeRelease}; +use noodles_vcf as vcf; + +pub mod header; + +/// Command line arguments for `seqvars ingest` subcommand. +#[derive(Debug, clap::Parser)] +#[command(author, version, about = "ingest sequence variant VCF", long_about = None)] +pub struct Args { + /// The assumed genome build. + #[clap(long)] + pub genomebuild: GenomeRelease, + /// Path to input file. + #[clap(long)] + pub path_in: String, + /// Path to output file. + #[clap(long)] + pub path_out: String, +} + +/// Main entry point for `seqvars ingest` sub command. +pub fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), anyhow::Error> { + let before_anything = std::time::Instant::now(); + tracing::info!("args_common = {:#?}", &args_common); + tracing::info!("args = {:#?}", &args); + + common::trace_rss_now(); + + tracing::info!("opening input file..."); + let mut input_reader = { + let file = std::fs::File::open(&args.path_in) + .map_err(|e| anyhow::anyhow!("could not open input file {}: {}", &args.path_in, e)) + .map(std::io::BufReader::new)?; + vcf::reader::Builder::default() + .build_from_reader(file) + .map_err(|e| anyhow::anyhow!("could not build VCF reader: {}", e))? 
+ }; + + tracing::info!("analyzing header..."); + let input_header = input_reader + .read_header() + .map_err(|e| anyhow::anyhow!("problem reading VCF header: {}", e))?; + let output_header = header::build_output_header(&input_header, args.genomebuild)?; + + let mut output_writer = { + let writer = std::fs::File::create(&args.path_out).map_err(|e| { + anyhow::anyhow!( + "could not output file for writing {}: {}", + &args.path_out, + e + ) + })?; + let writer = std::io::BufWriter::new(writer); + vcf::writer::Writer::new(writer) + }; + output_writer.write_header(&output_header)?; + + tracing::info!( + "All of `seqvars ingest` completed in {:?}", + before_anything.elapsed() + ); + Ok(()) +} + +#[cfg(test)] +mod test { + use rstest::rstest; + + use crate::common::GenomeRelease; + + macro_rules! set_snapshot_suffix { + ($($expr:expr),*) => { + let mut settings = insta::Settings::clone_current(); + settings.set_snapshot_suffix(format!($($expr,)*)); + let _guard = settings.bind_to_scope(); + } + } + + #[rstest] + #[case("tests/seqvars/ingest/example_dragen.07.021.624.3.10.4.vcf")] + #[case("tests/seqvars/ingest/example_dragen.07.021.624.3.10.9.vcf")] + #[case("tests/seqvars/ingest/example_gatk_hc.3.7-0.vcf")] + #[case("tests/seqvars/ingest/example_gatk_hc.4.4.0.0.vcf")] + fn smoke_test_run(#[case] path: &str) { + set_snapshot_suffix!("{:?}", path.split('/').last().unwrap().replace(".", "_")); + + let tmpdir = temp_testdir::TempDir::default(); + + let args_common = Default::default(); + let args = super::Args { + genomebuild: GenomeRelease::Grch37, + path_in: path.into(), + path_out: tmpdir + .join("out.vcf") + .to_str() + .expect("invalid path") + .into(), + }; + super::run(&args_common, &args).unwrap(); + } +} diff --git "a/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header@\"example_dragen.07.021.624.3.10.4.vcf\".snap.new" 
"b/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header@\"example_dragen.07.021.624.3.10.4.vcf\".snap.new" new file mode 100644 index 00000000..84098cd4 --- /dev/null +++ "b/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header@\"example_dragen.07.021.624.3.10.4.vcf\".snap.new" @@ -0,0 +1,124 @@ +--- +source: src/seqvars/ingest/header.rs +assertion_line: 107 +expression: "std::fs::read_to_string(out_path_str)?" +--- +##fileformat=VCFv4.2 +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##DRAGENCommandLine= +##DRAGENCommandLine= +##reference=file:///staging/human/reference/hs37d5/hs37d5.fa.k_21.f_16.m_149/reference.bin +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 + diff --git 
"a/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header@\"example_dragen.07.021.624.3.10.9.vcf\".snap.new" "b/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header@\"example_dragen.07.021.624.3.10.9.vcf\".snap.new" new file mode 100644 index 00000000..bd9f112c --- /dev/null +++ "b/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header@\"example_dragen.07.021.624.3.10.9.vcf\".snap.new" @@ -0,0 +1,146 @@ +--- +source: src/seqvars/ingest/header.rs +assertion_line: 107 +expression: "std::fs::read_to_string(out_path_str)?" +--- +##fileformat=VCFv4.2 +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##DRAGENCommandLine= +##DRAGENCommandLine= +##reference=file:///staging/human/reference/hs37d5/hs37d5.fa.k_21.f_16.m_149/reference.bin +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT CASE + diff --git "a/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header@\"example_gatk_hc.3.7-0.vcf\".snap.new" "b/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header@\"example_gatk_hc.3.7-0.vcf\".snap.new" new file mode 100644 index 00000000..371c8640 --- /dev/null +++ "b/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header@\"example_gatk_hc.3.7-0.vcf\".snap.new" @@ -0,0 +1,126 @@ +--- +source: src/seqvars/ingest/header.rs +assertion_line: 107 +expression: "std::fs::read_to_string(out_path_str)?" 
+--- +##fileformat=VCFv4.2 +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##GATKCommandLine.HaplotypeCaller= +##reference=file:///fast/projects/cubit/current/static_data/reference/GRCh37/hs37d5/hs37d5.fa +##SAMPLE= +##SAMPLE= +##SAMPLE= +##PEDIGREE= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Case_1_father-N1-DNA1-WGS1 Case_1_index-N1-DNA1-WGS1 Case_1_mother-N1-DNA1-WGS1 + diff --git "a/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header@\"example_gatk_hc.4.4.0.0.vcf\".snap.new" "b/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header@\"example_gatk_hc.4.4.0.0.vcf\".snap.new" new file mode 100644 index 00000000..40ff83e8 --- /dev/null +++ 
"b/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__build_output_header@\"example_gatk_hc.4.4.0.0.vcf\".snap.new" @@ -0,0 +1,146 @@ +--- +source: src/seqvars/ingest/header.rs +assertion_line: 107 +expression: "std::fs::read_to_string(out_path_str)?" +--- +##fileformat=VCFv4.2 +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##ALT= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##GATKCommandLine= +##GATKCommandLine= +##GATKCommandLine= +##source=CombineGVCFs +##source=GenotypeGVCFs +##source=HaplotypeCaller +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT CASE + diff --git 
"a/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__variant_caller_guess@\"example_dragen.07.021.624.3.10.4.vcf\".snap" "b/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__variant_caller_guess@\"example_dragen.07.021.624.3.10.4.vcf\".snap" new file mode 100644 index 00000000..0df41bd3 --- /dev/null +++ "b/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__variant_caller_guess@\"example_dragen.07.021.624.3.10.4.vcf\".snap" @@ -0,0 +1,7 @@ +--- +source: src/seqvars/ingest/header.rs +expression: "VariantCaller::guess(&vcf_header)" +--- +Dragen: + version: "SW: 07.021.624.3.10.4, HW: 07.021.624" + diff --git "a/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__variant_caller_guess@\"example_dragen.07.021.624.3.10.9.vcf\".snap" "b/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__variant_caller_guess@\"example_dragen.07.021.624.3.10.9.vcf\".snap" new file mode 100644 index 00000000..5089a957 --- /dev/null +++ "b/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__variant_caller_guess@\"example_dragen.07.021.624.3.10.9.vcf\".snap" @@ -0,0 +1,7 @@ +--- +source: src/seqvars/ingest/header.rs +expression: "VariantCaller::guess(&vcf_header)" +--- +Dragen: + version: "SW: 07.021.624.3.10.9, HW: 07.021.624" + diff --git "a/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__variant_caller_guess@\"example_dragen_07_021_624_3_10_9_vcf\".snap" "b/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__variant_caller_guess@\"example_dragen_07_021_624_3_10_9_vcf\".snap" new file mode 100644 index 00000000..5089a957 --- /dev/null +++ "b/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__variant_caller_guess@\"example_dragen_07_021_624_3_10_9_vcf\".snap" @@ -0,0 +1,7 @@ +--- +source: 
src/seqvars/ingest/header.rs +expression: "VariantCaller::guess(&vcf_header)" +--- +Dragen: + version: "SW: 07.021.624.3.10.9, HW: 07.021.624" + diff --git "a/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__variant_caller_guess@\"example_gatk_hc.3.7-0.vcf\".snap" "b/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__variant_caller_guess@\"example_gatk_hc.3.7-0.vcf\".snap" new file mode 100644 index 00000000..d6f9fb36 --- /dev/null +++ "b/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__variant_caller_guess@\"example_gatk_hc.3.7-0.vcf\".snap" @@ -0,0 +1,7 @@ +--- +source: src/seqvars/ingest/header.rs +expression: "VariantCaller::guess(&vcf_header)" +--- +GatkHaplotypeCaller: + version: 3.7-0-gcfedb67 + diff --git "a/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__variant_caller_guess@\"example_gatk_hc.4.4.0.0.vcf\".snap" "b/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__variant_caller_guess@\"example_gatk_hc.4.4.0.0.vcf\".snap" new file mode 100644 index 00000000..aeddbb48 --- /dev/null +++ "b/src/seqvars/ingest/snapshots/varfish_server_worker__seqvars__ingest__header__test__variant_caller_guess@\"example_gatk_hc.4.4.0.0.vcf\".snap" @@ -0,0 +1,7 @@ +--- +source: src/seqvars/ingest/header.rs +expression: "VariantCaller::guess(&vcf_header)" +--- +GatkHaplotypeCaller: + version: 4.4.0.0 + diff --git a/src/seqvars/mod.rs b/src/seqvars/mod.rs new file mode 100644 index 00000000..b3ac910a --- /dev/null +++ b/src/seqvars/mod.rs @@ -0,0 +1 @@ +pub mod ingest; diff --git a/tests/seqvars/ingest/example_dragen.07.021.624.3.10.4.vcf b/tests/seqvars/ingest/example_dragen.07.021.624.3.10.4.vcf new file mode 100644 index 00000000..91cd29a7 --- /dev/null +++ b/tests/seqvars/ingest/example_dragen.07.021.624.3.10.4.vcf @@ -0,0 +1,120 @@ +##fileformat=VCFv4.2 +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= 
+##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##DRAGENCommandLine= +##DRAGENCommandLine= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##reference=file:///staging/human/reference/hs37d5/hs37d5.fa.k_21.f_16.m_149/reference.bin +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 +17 41256074 . CA C 44.34 . AC=1;AF=0.500;AN=2;DP=18;FS=0.000;MQ=243.16;MQRankSum=3.266;QD=2.46;ReadPosRankSum=2.663;SOR=1.112;FractionInformativeReads=0.944 GT:AD:AF:DP:F1R2:F2R1:GQ:PL:GP:PRI:SB:MB:PS 0|1:11,6:0.353:17:8,3:3,3:43:55,0,47:4.4342e+01,2.0325e-04,5.0000e+01:0.00,11.00,14.01:5,6,2,4:4,7,5,1:41256074 +MT 750 . A G . . 
DP=7835;MQ=167.68;FractionInformativeReads=0.998 GT:SQ:AD:AF:F1R2:F2R1:DP:SB:MB 1/1:98.13:1,7818:1.000:1,3815:0,4003:7819:1,0,4126,3692:1,0,3979,3839 diff --git a/tests/seqvars/ingest/example_dragen.07.021.624.3.10.9.vcf b/tests/seqvars/ingest/example_dragen.07.021.624.3.10.9.vcf new file mode 100644 index 00000000..329baacb --- /dev/null +++ b/tests/seqvars/ingest/example_dragen.07.021.624.3.10.9.vcf @@ -0,0 +1,142 @@ +##fileformat=VCFv4.2 +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##DRAGENCommandLine= +##DRAGENCommandLine= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##reference=file:///staging/human/reference/hs37d5/hs37d5.fa.k_21.f_16.m_149/reference.bin +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT CASE +17 41256074 . CA C 49.57 PASS AC=1;AF=0.500;AN=2;DP=40;FS=1.397;MQ=249.24;MQRankSum=4.084;QD=1.24;ReadPosRankSum=1.804;SOR=1.051;FractionInformativeReads=0.800 GT:AD:AF:DP:F1R2:F2R1:GQ:PL:GP:PRI:SB:MB:PS 0|1:18,14:0.438:32:9,10:9,4:47:60,0,47:4.9567e+01,9.1418e-05,5.0000e+01:0.00,10.00,13.01:10,8,9,5:9,9,9,5:41256074 +MT 750 . A G . PASS DP=5639;MQ=150.72;FractionInformativeReads=0.995 GT:SQ:AD:AF:F1R2:F2R1:DP:SB:MB 1/1:98.13:1,5607:1.000:0,2721:1,2886:5608:0,1,3291,2316:1,0,2879,2728 diff --git a/tests/seqvars/ingest/example_gatk_hc.3.7-0.vcf b/tests/seqvars/ingest/example_gatk_hc.3.7-0.vcf new file mode 100644 index 00000000..430e8919 --- /dev/null +++ b/tests/seqvars/ingest/example_gatk_hc.3.7-0.vcf @@ -0,0 +1,122 @@ +##fileformat=VCFv4.2 +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##GATKCommandLine.HaplotypeCaller= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##reference=file:///fast/projects/cubit/current/static_data/reference/GRCh37/hs37d5/hs37d5.fa +##SAMPLE= +##SAMPLE= +##SAMPLE= +##PEDIGREE= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Case_1_father-N1-DNA1-WGS1 Case_1_index-N1-DNA1-WGS1 Case_1_mother-N1-DNA1-WGS1 +17 41210126 rs542522579 C CTAGCACTT 1136.12 . AC=2;AF=0.333;AN=6;BaseQRankSum=-1.948;ClippingRankSum=0;DB;DP=98;ExcessHet=3.9794;FS=0.876;MLEAC=2;MLEAF=0.333;MQ=60;MQRankSum=0;QD=16.71;ReadPosRankSum=-1.999;SOR=0.527 GT:AD:DP:GQ:PL 0/0:29,0:29:87:0,87,2916 0/1:23,13:36:99:466,0,2205 0/1:15,17:32:99:711,0,1410 +MT 750 rs2853518 A G 263720 . AC=6;AF=1;AN=6;DB;DP=6791;ExcessHet=3.0103;FS=0;MLEAC=6;MLEAF=1;MQ=59.96;QD=28.69;SOR=0.952 GT:AD:DP:GQ:PL 1/1:0,2757:2757:99:95517,8285,0 1/1:0,2392:2392:99:99668,7257,0 1/1:0,1621:1621:99:68561,4944,0 diff --git a/tests/seqvars/ingest/example_gatk_hc.4.4.0.0.vcf b/tests/seqvars/ingest/example_gatk_hc.4.4.0.0.vcf new file mode 100644 index 00000000..f53cd4d2 --- /dev/null +++ b/tests/seqvars/ingest/example_gatk_hc.4.4.0.0.vcf @@ -0,0 +1,142 @@ +##fileformat=VCFv4.2 +##FILTER= +##ALT= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##GATKCommandLine= +##GATKCommandLine= +##GATKCommandLine= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= 
+##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##source=CombineGVCFs +##source=GenotypeGVCFs +##source=HaplotypeCaller +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT CASE +17 41256074 . CA C 3524.03 . AC=2;AF=1;AN=2;AS_BaseQRankSum=.;AS_FS=0;AS_MQ=60;AS_MQRankSum=.;AS_QD=27.59;AS_ReadPosRankSum=.;AS_SOR=2.245;AS_UNIQ_ALT_READ_COUNT=80;DP=88;ExcessHet=0;FS=0;MLEAC=2;MLEAF=1;MQ=60;QD=30.55;SOR=2.245 GT:AD:DP:GQ:PGT:PID:PL:PS 1|1:0,80:80:99:1|1:41256074_CA_C:3538,241,0:41256074 +MT 750 . A G 1143.06 . AC=2;AF=1;AN=2;AS_BaseQRankSum=.;AS_FS=0;AS_MQ=60;AS_MQRankSum=.;AS_QD=32.66;AS_ReadPosRankSum=.;AS_SOR=1.148;AS_UNIQ_ALT_READ_COUNT=35;DP=35;ExcessHet=0;FS=0;MLEAC=2;MLEAF=1;MQ=60;QD=32.66;SOR=1.148 GT:AD:DP:GQ:PL 1/1:0,35:35:99:1157,105,0