Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
wip
Browse files Browse the repository at this point in the history
holtgrewe committed Oct 5, 2023
1 parent 417a64f commit 89e4233
Showing 8 changed files with 195 additions and 11 deletions.
120 changes: 115 additions & 5 deletions src/seqvars/ingest/header.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
use std::collections::HashSet;

use mehari::ped::{Disease, Sex};
use noodles_vcf as vcf;

use crate::common::GenomeRelease;
@@ -151,9 +154,28 @@ fn add_contigs_38(builder: vcf::header::Builder) -> Result<vcf::header::Builder,
Ok(builder)
}

/// Helper that returns header string value for `sex`.
fn sex_str(sex: Sex) -> String {
match sex {
Sex::Male => String::from("Male"),
Sex::Female => String::from("Female"),
Sex::Unknown => String::from("Unknown"),
}
}

// Helper that returns header string value for `disease`.
fn disease_str(disease: Disease) -> String {
match disease {
Disease::Affected => String::from("Affected"),
Disease::Unaffected => String::from("Unaffected"),
Disease::Unknown => String::from("Unknown"),
}
}

/// Generate the output header from the input header.
pub fn build_output_header(
input_header: &vcf::Header,
pedigree: &Option<mehari::ped::PedigreeByName>,
genomebuild: GenomeRelease,
worker_version: &str,
) -> Result<vcf::Header, anyhow::Error> {
@@ -285,11 +307,70 @@ pub fn build_output_header(
.build()?,
);

let builder = match genomebuild {
let mut builder = match genomebuild {
GenomeRelease::Grch37 => add_contigs_37(builder),
GenomeRelease::Grch38 => add_contigs_38(builder),
}?;

if let Some(pedigree) = pedigree {
let ped_idv = pedigree
.individuals
.iter()
.map(|(name, _)| name.clone())
.collect::<HashSet<_>>();
let input_idv = input_header
.sample_names()
.iter()
.map(|n| n.clone())
.collect::<HashSet<_>>();
if !ped_idv.eq(&input_idv) {
anyhow::bail!(
"pedigree individuals = {:?} != input individuals: {:?}",
&ped_idv,
&input_idv
)
}

for name in input_header.sample_names() {
let i = pedigree
.individuals
.get(name)
.expect("checked equality above");
if input_header.sample_names().contains(&i.name) {
builder = builder.add_sample_name(i.name.clone());
}

// Add SAMPLE entry.
builder = builder.insert(
"SAMPLE".parse()?,
noodles_vcf::header::record::Value::Map(
i.name.clone(),
Map::<Other>::builder()
.insert("Sex".parse()?, sex_str(i.sex))
.insert("Disease".parse()?, disease_str(i.disease))
.build()?,
),
)?;

// Add PEDIGREE entry.
let mut map_builder = Map::<Other>::builder();
if let Some(father) = i.father.as_ref() {
map_builder = map_builder.insert("Father".parse()?, father.clone());
}
if let Some(mother) = i.mother.as_ref() {
map_builder = map_builder.insert("Mother".parse()?, mother.clone());
}
builder = builder.insert(
"PEDIGREE".parse()?,
noodles_vcf::header::record::Value::Map(i.name.clone(), map_builder.build()?),
)?;
}
} else {
for name in input_header.sample_names() {
builder = builder.add_sample_name(name.clone());
}
}

use vcf::header::record::value::map::Other;

let orig_caller = VariantCaller::guess(input_header)
@@ -329,13 +410,12 @@ pub fn build_output_header(
)?,
};

// TODO: embed pedigree information

Ok(builder.build())
}

#[cfg(test)]
mod test {
use mehari::ped::PedigreeByName;
use rstest::rstest;

use super::VariantCaller;
@@ -371,14 +451,15 @@ mod test {
#[case("tests/seqvars/ingest/example_gatk_hc.3.7-0.vcf")]
#[case("tests/seqvars/ingest/example_gatk_hc.4.4.0.0.vcf")]
fn build_output_header_37(#[case] path: &str) -> Result<(), anyhow::Error> {
set_snapshot_suffix!("{:?}", path.split('/').last().unwrap());
set_snapshot_suffix!("{}", path.split('/').last().unwrap());
let tmpdir = temp_testdir::TempDir::default();

let input_vcf_header = noodles_vcf::reader::Builder::default()
.build_from_path(path)?
.read_header()?;
let output_vcf_header = super::build_output_header(
&input_vcf_header,
&None,
crate::common::GenomeRelease::Grch37,
"x.y.z",
)?;
@@ -401,14 +482,43 @@ mod test {
#[case("tests/seqvars/ingest/example_gatk_hc.3.7-0.vcf")]
#[case("tests/seqvars/ingest/example_gatk_hc.4.4.0.0.vcf")]
fn build_output_header_38(#[case] path: &str) -> Result<(), anyhow::Error> {
set_snapshot_suffix!("{:?}", path.split('/').last().unwrap());
set_snapshot_suffix!("{}", path.split('/').last().unwrap());
let tmpdir = temp_testdir::TempDir::default();

let input_vcf_header = noodles_vcf::reader::Builder::default()
.build_from_path(path)?
.read_header()?;
let output_vcf_header = super::build_output_header(
&input_vcf_header,
&None,
crate::common::GenomeRelease::Grch38,
"x.y.z",
)?;

let out_path = tmpdir.join("out.vcf");
let out_path_str = out_path.to_str().expect("invalid path");
{
noodles_vcf::writer::Writer::new(std::fs::File::create(out_path_str)?)
.write_header(&output_vcf_header)?;
}

insta::assert_snapshot!(std::fs::read_to_string(out_path_str)?);

Ok(())
}

#[test]
fn build_output_header_ped() -> Result<(), anyhow::Error> {
let tmpdir = temp_testdir::TempDir::default();

let pedigree = PedigreeByName::from_path("tests/seqvars/ingest/example_gatk_hc.3.7-0.ped")?;

let input_vcf_header = noodles_vcf::reader::Builder::default()
.build_from_path("tests/seqvars/ingest/example_gatk_hc.3.7-0.vcf")?
.read_header()?;
let output_vcf_header = super::build_output_header(
&input_vcf_header,
&Some(pedigree),
crate::common::GenomeRelease::Grch38,
"x.y.z",
)?;
17 changes: 15 additions & 2 deletions src/seqvars/ingest/mod.rs
Original file line number Diff line number Diff line change
@@ -12,6 +12,9 @@ pub struct Args {
/// The assumed genome build.
#[clap(long)]
pub genomebuild: GenomeRelease,
/// Path to the pedigree file.
#[clap(long)]
pub path_ped: String,
/// Path to input file.
#[clap(long)]
pub path_in: String,
@@ -37,6 +40,11 @@ pub fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), anyhow:

common::trace_rss_now();

tracing::info!("loading pedigree...");
let pedigree = mehari::ped::PedigreeByName::from_path(&args.path_ped)
.map_err(|e| anyhow::anyhow!("problem parsing PED file: {}", e))?;
tracing::info!("pedigre = {:#?}", &pedigree);

tracing::info!("opening input file...");
let mut input_reader = {
let file = std::fs::File::open(&args.path_in)
@@ -51,8 +59,12 @@ pub fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), anyhow:
let input_header = input_reader
.read_header()
.map_err(|e| anyhow::anyhow!("problem reading VCF header: {}", e))?;
let output_header =
header::build_output_header(&input_header, args.genomebuild, worker_version())?;
let output_header = header::build_output_header(
&input_header,
&Some(pedigree),
args.genomebuild,
worker_version(),
)?;

let mut output_writer = {
let writer = std::fs::File::create(&args.path_out).map_err(|e| {
@@ -100,6 +112,7 @@ mod test {

let args_common = Default::default();
let args = super::Args {
path_ped: "tests/seqvars/ingest/example_gatk_hc.3.7-0.ped".into(),
genomebuild: GenomeRelease::Grch37,
path_in: path.into(),
path_out: tmpdir
Original file line number Diff line number Diff line change
@@ -48,5 +48,5 @@ expression: "std::fs::read_to_string(out_path_str)?"
##contig=<ID=chrM,length=16569,assembly="GRCh37",species="Homo sapiens">
##x-varfish-version=<ID=varfish-server-worker,Version="x.y.z">
##x-varfish-version=<ID=orig-caller,Name="Dragen",Version="SW: 07.021.624.3.10.4, HW: 07.021.624">
#CHROM POS ID REF ALT QUAL FILTER INFO
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878

Original file line number Diff line number Diff line change
@@ -48,5 +48,5 @@ expression: "std::fs::read_to_string(out_path_str)?"
##contig=<ID=chrM,length=16569,assembly="GRCh37",species="Homo sapiens">
##x-varfish-version=<ID=varfish-server-worker,Version="x.y.z">
##x-varfish-version=<ID=orig-caller,Name="Dragen",Version="SW: 07.021.624.3.10.9, HW: 07.021.624">
#CHROM POS ID REF ALT QUAL FILTER INFO
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT CASE

Original file line number Diff line number Diff line change
@@ -48,5 +48,5 @@ expression: "std::fs::read_to_string(out_path_str)?"
##contig=<ID=chrM,length=16569,assembly="GRCh37",species="Homo sapiens">
##x-varfish-version=<ID=varfish-server-worker,Version="x.y.z">
##x-varfish-version=<ID=orig-caller,Name="GatkHaplotypeCaller",Version="3.7-0-gcfedb67">
#CHROM POS ID REF ALT QUAL FILTER INFO
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Case_1_father-N1-DNA1-WGS1 Case_1_index-N1-DNA1-WGS1 Case_1_mother-N1-DNA1-WGS1

Original file line number Diff line number Diff line change
@@ -48,5 +48,5 @@ expression: "std::fs::read_to_string(out_path_str)?"
##contig=<ID=chrM,length=16569,assembly="GRCh37",species="Homo sapiens">
##x-varfish-version=<ID=varfish-server-worker,Version="x.y.z">
##x-varfish-version=<ID=orig-caller,Name="GatkHaplotypeCaller",Version="4.4.0.0">
#CHROM POS ID REF ALT QUAL FILTER INFO
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT CASE

Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
---
source: src/seqvars/ingest/header.rs
expression: "std::fs::read_to_string(out_path_str)?"
---
##fileformat=VCFv4.4
##INFO=<ID=gnomad_exomes_an,Number=1,Type=Integer,Description="Number of samples in gnomAD exomes">
##INFO=<ID=gnomad_exomes_hom,Number=1,Type=Integer,Description="Number of hom. alt. carriers in gnomAD exomes">
##INFO=<ID=gnomad_exomes_het,Number=1,Type=Integer,Description="Number of het. alt. carriers in gnomAD exomes">
##INFO=<ID=gnomad_exomes_hemi,Number=1,Type=Integer,Description="Number of hemi. alt. carriers in gnomAD exomes">
##INFO=<ID=gnomad_genomes_an,Number=1,Type=Integer,Description="Number of samples in gnomAD genomes">
##INFO=<ID=gnomad_genomes_hom,Number=1,Type=Integer,Description="Number of hom. alt. carriers in gnomAD genomes">
##INFO=<ID=gnomad_genomes_het,Number=1,Type=Integer,Description="Number of het. alt. carriers in gnomAD genomes">
##INFO=<ID=gnomad_genomes_hemi,Number=1,Type=Integer,Description="Number of hemi. alt. carriers in gnomAD genomes">
##INFO=<ID=helix_an,Number=1,Type=Integer,Description="Number of samples in HelixMtDb">
##INFO=<ID=helix_hom,Number=1,Type=Integer,Description="Number of hom. alt. carriers in HelixMtDb">
##INFO=<ID=helix_het,Number=1,Type=Integer,Description="Number of het. alt. carriers in HelixMtDb">
##INFO=<ID=ANN,Number=.,Type=String,Description="Functional annotations: 'Allele | Annotation | Annotation_Impact | Gene_Name | Gene_ID | Feature_Type | Feature_ID | Transcript_BioType | Rank | HGVS.c | HGVS.p | cDNA.pos / cDNA.length | CDS.pos / CDS.length | AA.pos / AA.length | Distance | ERRORS / WARNINGS / INFO'">
##FILTER=<ID=PASS,Description="All filters passed">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Read depth for each allele">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Conditional genotype quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
##contig=<ID=chr1,length=248956422,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=chr2,length=242193529,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=chr3,length=198295559,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=chr4,length=190214555,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=chr5,length=181538259,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=chr6,length=170805979,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=chr7,length=159345973,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=chr8,length=145138636,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=chr9,length=138394717,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=chr10,length=133797422,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=chr11,length=135086622,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=chr12,length=133275309,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=chr13,length=114364328,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=chr14,length=107043718,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=chr15,length=101991189,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=chr16,length=90338345,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=chr17,length=83257441,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=chr18,length=80373285,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=chr19,length=58617616,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=chr20,length=64444167,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=chr21,length=46709983,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=chr22,length=50818468,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=chrX,length=156040895,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=chrY,length=57227415,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=chrM,length=16569,assembly="GRCh37",species="Homo sapiens">
##SAMPLE=<ID=Case_1_father-N1-DNA1-WGS1,Sex="Male",Disease="Unaffected">
##SAMPLE=<ID=Case_1_index-N1-DNA1-WGS1,Sex="Female",Disease="Affected">
##SAMPLE=<ID=Case_1_mother-N1-DNA1-WGS1,Sex="Male",Disease="Unaffected">
##PEDIGREE=<ID=Case_1_father-N1-DNA1-WGS1>
##PEDIGREE=<ID=Case_1_index-N1-DNA1-WGS1,Father="Case_1_father-N1-DNA1-WGS1",Mother="Case_1_mother-N1-DNA1-WGS1">
##PEDIGREE=<ID=Case_1_mother-N1-DNA1-WGS1>
##x-varfish-version=<ID=varfish-server-worker,Version="x.y.z">
##x-varfish-version=<ID=orig-caller,Name="GatkHaplotypeCaller",Version="3.7-0-gcfedb67">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Case_1_father-N1-DNA1-WGS1 Case_1_index-N1-DNA1-WGS1 Case_1_mother-N1-DNA1-WGS1

3 changes: 3 additions & 0 deletions tests/seqvars/ingest/example_gatk_hc.3.7-0.ped
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
FAM Case_1_index-N1-DNA1-WGS1 Case_1_father-N1-DNA1-WGS1 Case_1_mother-N1-DNA1-WGS1 2 2
FAM Case_1_father-N1-DNA1-WGS1 0 0 1 1
FAM Case_1_mother-N1-DNA1-WGS1 0 0 1 1

0 comments on commit 89e4233

Please sign in to comment.