Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe committed Oct 5, 2023
1 parent 89e4233 commit 70a2b28
Show file tree
Hide file tree
Showing 13 changed files with 388 additions and 149 deletions.
85 changes: 43 additions & 42 deletions src/seqvars/ingest/header.rs
Original file line number Diff line number Diff line change
Expand Up @@ -86,18 +86,30 @@ fn add_contigs_37(builder: vcf::header::Builder) -> Result<vcf::header::Builder,
("20", 63025520),
("21", 48129895),
("22", 51304566),
("X ", 155270560),
("Y ", 59373566),
("X", 155270560),
("Y", 59373566),
("MT", 16569),
];

for (contig, length) in specs {
builder = builder.add_contig(
contig.parse()?,
contig
.parse()
.map_err(|_| anyhow::anyhow!("invalid contig: {}", contig))?,
Map::<Contig>::builder()
.set_length(*length)
.insert("assembly".parse()?, "GRCh37")
.insert("species".parse()?, "Homo sapiens")
.insert(
"assembly"
.parse()
.map_err(|_| anyhow::anyhow!("invalid key: assembly"))?,

Check warning on line 104 in src/seqvars/ingest/header.rs

View check run for this annotation

Codecov / codecov/patch

src/seqvars/ingest/header.rs#L104

Added line #L104 was not covered by tests
"GRCh37",
)
.insert(
"species"
.parse()
.map_err(|_| anyhow::anyhow!("invalid key: species"))?,

Check warning on line 110 in src/seqvars/ingest/header.rs

View check run for this annotation

Codecov / codecov/patch

src/seqvars/ingest/header.rs#L110

Added line #L110 was not covered by tests
"Homo sapiens",
)
.build()?,

Check warning on line 113 in src/seqvars/ingest/header.rs

View check run for this annotation

Codecov / codecov/patch

src/seqvars/ingest/header.rs#L113

Added line #L113 was not covered by tests
);
}
Expand Down Expand Up @@ -142,11 +154,23 @@ fn add_contigs_38(builder: vcf::header::Builder) -> Result<vcf::header::Builder,

for (contig, length) in specs {
builder = builder.add_contig(
contig.parse()?,
contig
.parse()
.map_err(|_| anyhow::anyhow!("invalid contig: {}", contig))?,
Map::<Contig>::builder()
.set_length(*length)
.insert("assembly".parse()?, "GRCh37")
.insert("species".parse()?, "Homo sapiens")
.insert(
"assembly"
.parse()
.map_err(|_| anyhow::anyhow!("invalid key: assembly"))?,

Check warning on line 165 in src/seqvars/ingest/header.rs

View check run for this annotation

Codecov / codecov/patch

src/seqvars/ingest/header.rs#L165

Added line #L165 was not covered by tests
"GRCh38",
)
.insert(
"species"
.parse()
.map_err(|_| anyhow::anyhow!("invalid key: species"))?,

Check warning on line 171 in src/seqvars/ingest/header.rs

View check run for this annotation

Codecov / codecov/patch

src/seqvars/ingest/header.rs#L171

Added line #L171 was not covered by tests
"Homo sapiens",
)
.build()?,

Check warning on line 174 in src/seqvars/ingest/header.rs

View check run for this annotation

Codecov / codecov/patch

src/seqvars/ingest/header.rs#L174

Added line #L174 was not covered by tests
);
}
Expand Down Expand Up @@ -310,7 +334,8 @@ pub fn build_output_header(
let mut builder = match genomebuild {
GenomeRelease::Grch37 => add_contigs_37(builder),
GenomeRelease::Grch38 => add_contigs_38(builder),
}?;
}
.map_err(|e| anyhow::anyhow!("problem adding contigs: {}", e))?;

Check warning on line 338 in src/seqvars/ingest/header.rs

View check run for this annotation

Codecov / codecov/patch

src/seqvars/ingest/header.rs#L338

Added line #L338 was not covered by tests

if let Some(pedigree) = pedigree {
let ped_idv = pedigree
Expand All @@ -321,7 +346,7 @@ pub fn build_output_header(
let input_idv = input_header
.sample_names()
.iter()
.map(|n| n.clone())
.cloned()
.collect::<HashSet<_>>();
if !ped_idv.eq(&input_idv) {
anyhow::bail!(

Check warning on line 352 in src/seqvars/ingest/header.rs

View check run for this annotation

Codecov / codecov/patch

src/seqvars/ingest/header.rs#L352

Added line #L352 was not covered by tests
Expand Down Expand Up @@ -436,7 +461,7 @@ mod test {
fn variant_caller_guess(#[case] path: &str) -> Result<(), anyhow::Error> {
set_snapshot_suffix!("{}", path.split('/').last().unwrap());

let vcf_header = noodles_vcf::reader::Builder::default()
let vcf_header = noodles_vcf::reader::Builder
.build_from_path(path)?

Check warning on line 465 in src/seqvars/ingest/header.rs

View check run for this annotation

Codecov / codecov/patch

src/seqvars/ingest/header.rs#L465

Added line #L465 was not covered by tests
.read_header()?;

Expand All @@ -454,12 +479,14 @@ mod test {
set_snapshot_suffix!("{}", path.split('/').last().unwrap());
let tmpdir = temp_testdir::TempDir::default();

let input_vcf_header = noodles_vcf::reader::Builder::default()
let pedigree = PedigreeByName::from_path(path.replace(".vcf", ".ped")).unwrap();

let input_vcf_header = noodles_vcf::reader::Builder
.build_from_path(path)?

Check warning on line 485 in src/seqvars/ingest/header.rs

View check run for this annotation

Codecov / codecov/patch

src/seqvars/ingest/header.rs#L485

Added line #L485 was not covered by tests
.read_header()?;
let output_vcf_header = super::build_output_header(
&input_vcf_header,
&None,
&Some(pedigree),
crate::common::GenomeRelease::Grch37,
"x.y.z",
)?;
Expand All @@ -485,36 +512,10 @@ mod test {
set_snapshot_suffix!("{}", path.split('/').last().unwrap());
let tmpdir = temp_testdir::TempDir::default();

let input_vcf_header = noodles_vcf::reader::Builder::default()
.build_from_path(path)?
.read_header()?;
let output_vcf_header = super::build_output_header(
&input_vcf_header,
&None,
crate::common::GenomeRelease::Grch38,
"x.y.z",
)?;

let out_path = tmpdir.join("out.vcf");
let out_path_str = out_path.to_str().expect("invalid path");
{
noodles_vcf::writer::Writer::new(std::fs::File::create(out_path_str)?)
.write_header(&output_vcf_header)?;
}

insta::assert_snapshot!(std::fs::read_to_string(out_path_str)?);

Ok(())
}

#[test]
fn build_output_header_ped() -> Result<(), anyhow::Error> {
let tmpdir = temp_testdir::TempDir::default();

let pedigree = PedigreeByName::from_path("tests/seqvars/ingest/example_gatk_hc.3.7-0.ped")?;
let pedigree = PedigreeByName::from_path(path.replace(".vcf", ".ped")).unwrap();

let input_vcf_header = noodles_vcf::reader::Builder::default()
.build_from_path("tests/seqvars/ingest/example_gatk_hc.3.7-0.vcf")?
let input_vcf_header = noodles_vcf::reader::Builder
.build_from_path(path)?

Check warning on line 518 in src/seqvars/ingest/header.rs

View check run for this annotation

Codecov / codecov/patch

src/seqvars/ingest/header.rs#L518

Added line #L518 was not covered by tests
.read_header()?;
let output_vcf_header = super::build_output_header(
&input_vcf_header,
Expand Down
17 changes: 10 additions & 7 deletions src/seqvars/ingest/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ pub fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), anyhow:
let file = std::fs::File::open(&args.path_in)
.map_err(|e| anyhow::anyhow!("could not open input file {}: {}", &args.path_in, e))
.map(std::io::BufReader::new)?;

Check warning on line 52 in src/seqvars/ingest/mod.rs

View check run for this annotation

Codecov / codecov/patch

src/seqvars/ingest/mod.rs#L52

Added line #L52 was not covered by tests
vcf::reader::Builder::default()
vcf::reader::Builder
.build_from_reader(file)
.map_err(|e| anyhow::anyhow!("could not build VCF reader: {}", e))?

Check warning on line 55 in src/seqvars/ingest/mod.rs

View check run for this annotation

Codecov / codecov/patch

src/seqvars/ingest/mod.rs#L55

Added line #L55 was not covered by tests
};
Expand All @@ -64,7 +64,8 @@ pub fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), anyhow:
&Some(pedigree),
args.genomebuild,
worker_version(),
)?;
)
.map_err(|e| anyhow::anyhow!("problem building output header: {}", e))?;

let mut output_writer = {
let writer = std::fs::File::create(&args.path_out).map_err(|e| {
Expand All @@ -77,7 +78,9 @@ pub fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), anyhow:
let writer = std::io::BufWriter::new(writer);
vcf::writer::Writer::new(writer)
};
output_writer.write_header(&output_header)?;
output_writer
.write_header(&output_header)
.map_err(|e| anyhow::anyhow!("problem writing header: {}", e))?;

Check warning on line 83 in src/seqvars/ingest/mod.rs

View check run for this annotation

Codecov / codecov/patch

src/seqvars/ingest/mod.rs#L83

Added line #L83 was not covered by tests

tracing::info!(
"All of `seqvars ingest` completed in {:?}",
Expand Down Expand Up @@ -105,14 +108,14 @@ mod test {
#[case("tests/seqvars/ingest/example_dragen.07.021.624.3.10.9.vcf")]
#[case("tests/seqvars/ingest/example_gatk_hc.3.7-0.vcf")]
#[case("tests/seqvars/ingest/example_gatk_hc.4.4.0.0.vcf")]
fn smoke_test_run(#[case] path: &str) {
set_snapshot_suffix!("{:?}", path.split('/').last().unwrap().replace(".", "_"));
fn smoke_test_run(#[case] path: &str) -> Result<(), anyhow::Error> {
set_snapshot_suffix!("{:?}", path.split('/').last().unwrap().replace('.', "_"));

let tmpdir = temp_testdir::TempDir::default();

let args_common = Default::default();
let args = super::Args {
path_ped: "tests/seqvars/ingest/example_gatk_hc.3.7-0.ped".into(),
path_ped: path.replace(".vcf", ".ped"),
genomebuild: GenomeRelease::Grch37,
path_in: path.into(),
path_out: tmpdir
Expand All @@ -121,6 +124,6 @@ mod test {
.expect("invalid path")
.into(),
};
super::run(&args_common, &args).unwrap();
super::run(&args_common, &args)
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
---
source: src/seqvars/ingest/header.rs
expression: "std::fs::read_to_string(out_path_str)?"
---
##fileformat=VCFv4.4
##INFO=<ID=gnomad_exomes_an,Number=1,Type=Integer,Description="Number of samples in gnomAD exomes">
##INFO=<ID=gnomad_exomes_hom,Number=1,Type=Integer,Description="Number of hom. alt. carriers in gnomAD exomes">
##INFO=<ID=gnomad_exomes_het,Number=1,Type=Integer,Description="Number of het. alt. carriers in gnomAD exomes">
##INFO=<ID=gnomad_exomes_hemi,Number=1,Type=Integer,Description="Number of hemi. alt. carriers in gnomAD exomes">
##INFO=<ID=gnomad_genomes_an,Number=1,Type=Integer,Description="Number of samples in gnomAD genomes">
##INFO=<ID=gnomad_genomes_hom,Number=1,Type=Integer,Description="Number of hom. alt. carriers in gnomAD genomes">
##INFO=<ID=gnomad_genomes_het,Number=1,Type=Integer,Description="Number of het. alt. carriers in gnomAD genomes">
##INFO=<ID=gnomad_genomes_hemi,Number=1,Type=Integer,Description="Number of hemi. alt. carriers in gnomAD genomes">
##INFO=<ID=helix_an,Number=1,Type=Integer,Description="Number of samples in HelixMtDb">
##INFO=<ID=helix_hom,Number=1,Type=Integer,Description="Number of hom. alt. carriers in HelixMtDb">
##INFO=<ID=helix_het,Number=1,Type=Integer,Description="Number of het. alt. carriers in HelixMtDb">
##INFO=<ID=ANN,Number=.,Type=String,Description="Functional annotations: 'Allele | Annotation | Annotation_Impact | Gene_Name | Gene_ID | Feature_Type | Feature_ID | Transcript_BioType | Rank | HGVS.c | HGVS.p | cDNA.pos / cDNA.length | CDS.pos / CDS.length | AA.pos / AA.length | Distance | ERRORS / WARNINGS / INFO'">
##FILTER=<ID=PASS,Description="All filters passed">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Read depth for each allele">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Conditional genotype quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
##contig=<ID=1,length=249250621,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=2,length=243199373,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=3,length=198022430,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=4,length=191154276,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=5,length=180915260,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=6,length=171115067,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=7,length=159138663,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=8,length=146364022,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=9,length=141213431,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=10,length=135534747,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=11,length=135006516,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=12,length=133851895,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=13,length=115169878,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=14,length=107349540,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=15,length=102531392,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=16,length=90354753,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=17,length=81195210,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=18,length=78077248,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=19,length=59128983,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=20,length=63025520,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=21,length=48129895,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=22,length=51304566,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=X,length=155270560,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=Y,length=59373566,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=MT,length=16569,assembly="GRCh37",species="Homo sapiens">
##SAMPLE=<ID=NA12878,Sex="Male",Disease="Affected">
##PEDIGREE=<ID=NA12878>
##x-varfish-version=<ID=varfish-server-worker,Version="x.y.z">
##x-varfish-version=<ID=orig-caller,Name="Dragen",Version="SW: 07.021.624.3.10.4, HW: 07.021.624">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878

Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
---
source: src/seqvars/ingest/header.rs
expression: "std::fs::read_to_string(out_path_str)?"
---
##fileformat=VCFv4.4
##INFO=<ID=gnomad_exomes_an,Number=1,Type=Integer,Description="Number of samples in gnomAD exomes">
##INFO=<ID=gnomad_exomes_hom,Number=1,Type=Integer,Description="Number of hom. alt. carriers in gnomAD exomes">
##INFO=<ID=gnomad_exomes_het,Number=1,Type=Integer,Description="Number of het. alt. carriers in gnomAD exomes">
##INFO=<ID=gnomad_exomes_hemi,Number=1,Type=Integer,Description="Number of hemi. alt. carriers in gnomAD exomes">
##INFO=<ID=gnomad_genomes_an,Number=1,Type=Integer,Description="Number of samples in gnomAD genomes">
##INFO=<ID=gnomad_genomes_hom,Number=1,Type=Integer,Description="Number of hom. alt. carriers in gnomAD genomes">
##INFO=<ID=gnomad_genomes_het,Number=1,Type=Integer,Description="Number of het. alt. carriers in gnomAD genomes">
##INFO=<ID=gnomad_genomes_hemi,Number=1,Type=Integer,Description="Number of hemi. alt. carriers in gnomAD genomes">
##INFO=<ID=helix_an,Number=1,Type=Integer,Description="Number of samples in HelixMtDb">
##INFO=<ID=helix_hom,Number=1,Type=Integer,Description="Number of hom. alt. carriers in HelixMtDb">
##INFO=<ID=helix_het,Number=1,Type=Integer,Description="Number of het. alt. carriers in HelixMtDb">
##INFO=<ID=ANN,Number=.,Type=String,Description="Functional annotations: 'Allele | Annotation | Annotation_Impact | Gene_Name | Gene_ID | Feature_Type | Feature_ID | Transcript_BioType | Rank | HGVS.c | HGVS.p | cDNA.pos / cDNA.length | CDS.pos / CDS.length | AA.pos / AA.length | Distance | ERRORS / WARNINGS / INFO'">
##FILTER=<ID=PASS,Description="All filters passed">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Read depth for each allele">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Conditional genotype quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
##contig=<ID=1,length=249250621,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=2,length=243199373,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=3,length=198022430,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=4,length=191154276,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=5,length=180915260,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=6,length=171115067,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=7,length=159138663,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=8,length=146364022,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=9,length=141213431,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=10,length=135534747,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=11,length=135006516,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=12,length=133851895,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=13,length=115169878,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=14,length=107349540,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=15,length=102531392,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=16,length=90354753,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=17,length=81195210,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=18,length=78077248,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=19,length=59128983,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=20,length=63025520,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=21,length=48129895,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=22,length=51304566,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=X,length=155270560,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=Y,length=59373566,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=MT,length=16569,assembly="GRCh37",species="Homo sapiens">
##SAMPLE=<ID=CASE,Sex="Male",Disease="Affected">
##PEDIGREE=<ID=CASE>
##x-varfish-version=<ID=varfish-server-worker,Version="x.y.z">
##x-varfish-version=<ID=orig-caller,Name="Dragen",Version="SW: 07.021.624.3.10.9, HW: 07.021.624">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT CASE

Loading

0 comments on commit 70a2b28

Please sign in to comment.