Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe committed Oct 6, 2023
1 parent 4626b8f commit d5ca076
Show file tree
Hide file tree
Showing 22 changed files with 795 additions and 91 deletions.
8 changes: 8 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ tracing-subscriber = "0.3"
uuid = { version = "1.4", features = ["v4", "fast-rng", "serde"] }
noodles-vcf = "0.40.0"
rocksdb = { version = "0.21.0", features = ["multi-threaded-cf"] }
noodles-bgzf = "0.24.0"


[build-dependencies]
Expand All @@ -57,6 +58,7 @@ prost-build = "0.12"
[dev-dependencies]
file_diff = "1.0"
float-cmp = "0.9.0"
hxdmp = "0.2.1"
insta = { version = "1.32", features = ["yaml"] }
pretty_assertions = "1.4"
rstest = "0.18.2"
Expand Down
4 changes: 1 addition & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,9 +80,7 @@ The command interprets the following fields which are written out by the commonl
- `FORMAT/GQ` -- genotype quality
- `FORMAT/DP` -- total read coverage
- `FORMAT/AD` -- allelic depth, one value per allele (including reference0)
- `FORMAT/PID` -- physical phasing information as written out by GATK HaplotypeCaller in GVCF workflow
- `FORMAT/PS` -- physical phasing information as written out by Dragen variant caller
- this field fill be written as `FORMAT/PID`
- `FORMAT/PS` -- physical phasing information as written out by GATK HaplotypeCaller in GVCF workflow and Dragen variant caller
- `FORMAT/SQ` -- "somatic quality" for each alternate allele, as written out by Illumina Dragen variant caller
- this field will be written as `FORMAT/GQ`

Expand Down
13 changes: 1 addition & 12 deletions src/seqvars/ingest/header.rs
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,6 @@ pub fn build_output_header(
genomebuild: GenomeRelease,
worker_version: &str,
) -> Result<vcf::Header, anyhow::Error> {
use vcf::header::record::value::map::format::Type as FormatType;
use vcf::header::record::value::{
map::{info::Type, Filter, Format, Info},
Map,
Expand Down Expand Up @@ -300,17 +299,7 @@ pub fn build_output_header(
Map::<Format>::from(&key::CONDITIONAL_GENOTYPE_QUALITY),
)
.add_format(key::GENOTYPE, Map::<Format>::from(&key::GENOTYPE))
.add_format(
"PID".parse()?,
Map::<Format>::builder()
.set_number(Number::Count(1))
.set_type(FormatType::String)
.set_description(
"Physical phasing ID information, where each unique ID within a given sample \
(but not across samples) connects records within a phasing group",
)
.build()?,
);
.add_format(key::PHASE_SET, Map::<Format>::from(&key::PHASE_SET));

let mut builder = match genomebuild {
GenomeRelease::Grch37 => add_contigs_37(builder),
Expand Down
106 changes: 61 additions & 45 deletions src/seqvars/ingest/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
use std::sync::{Arc, OnceLock};

use crate::common::{self, GenomeRelease};
use crate::common::{self, open_read_maybe_gz, open_write_maybe_gz, GenomeRelease};
use mehari::annotate::seqvars::provider::MehariProvider;
use noodles_vcf as vcf;
use thousands::Separable;
Expand Down Expand Up @@ -73,27 +73,20 @@ impl Default for KnownFormatKeys {
vcf::record::genotypes::keys::key::CONDITIONAL_GENOTYPE_QUALITY, // GQ
vcf::record::genotypes::keys::key::READ_DEPTH, // DP
vcf::record::genotypes::keys::key::READ_DEPTHS, // AD
"PID".parse().expect("invalid key: PID"),
vcf::record::genotypes::keys::key::PHASE_SET, // PS
],
known_keys: vec![
vcf::record::genotypes::keys::key::GENOTYPE,
vcf::record::genotypes::keys::key::CONDITIONAL_GENOTYPE_QUALITY,
vcf::record::genotypes::keys::key::READ_DEPTH,
vcf::record::genotypes::keys::key::READ_DEPTHS,
"PID".parse().expect("invalid key: PID"),
"PS".parse().expect("invalid key: PS"), // written as PID
"SQ".parse().expect("invalid key: SQ"), // written as AD
vcf::record::genotypes::keys::key::PHASE_SET, // PS
"SQ".parse().expect("invalid key: SQ"), // written as AD
],
known_to_output_map: vec![
(
"PS".parse().expect("invalid key: PS"),
"PID".parse().expect("invalid key: PID"),
),
(
"SQ".parse().expect("invalid key: SQ"),
vcf::record::genotypes::keys::key::CONDITIONAL_GENOTYPE_QUALITY,
),
]
known_to_output_map: vec![(
"SQ".parse().expect("invalid key: SQ"),
vcf::record::genotypes::keys::key::CONDITIONAL_GENOTYPE_QUALITY,
)]
.into_iter()
.collect(),
}
Expand Down Expand Up @@ -132,15 +125,6 @@ fn transform_format_value(
_ => return None, // unreachable!("FORMAT/AD must be array of integer"),
}
}
"PS" => {
// PS is written as PID.
match *value {
vcf::record::genotypes::sample::Value::Integer(ps_value) => {
vcf::record::genotypes::sample::Value::String(format!("{}", ps_value))
}
_ => return None, // unreachable!("FORMAT/PS must be integer"),
}
}
"SQ" => {
// SQ is written as AD.
match *value {
Expand Down Expand Up @@ -218,13 +202,17 @@ fn copy_format(
}

/// Process the variants from `input_reader` to `output_writer`.
fn process_variants(
output_writer: &mut vcf::Writer<std::io::BufWriter<std::fs::File>>,
input_reader: &mut vcf::Reader<std::io::BufReader<std::fs::File>>,
fn process_variants<R, W>(
output_writer: &mut vcf::Writer<W>,
input_reader: &mut vcf::Reader<R>,
output_header: &vcf::Header,
input_header: &vcf::Header,
args: &Args,
) -> Result<(), anyhow::Error> {
) -> Result<(), anyhow::Error>
where
R: std::io::BufRead,
W: std::io::Write,
{
// Open the frequency RocksDB database in read only mode.
tracing::info!("Opening frequency database");
let rocksdb_path = format!(
Expand Down Expand Up @@ -323,8 +311,7 @@ fn process_variants(
let mut output_record = builder.build()?;

// Obtain annonars variant key from current allele for RocksDB lookup.
let vcf_var =
annonars::common::keys::Var::from_vcf_allele(&output_record, allele_no);
let vcf_var = annonars::common::keys::Var::from_vcf_allele(&output_record, 0);

// Skip records with a deletion as alternative allele.
if vcf_var.alternative == "*" {
Expand Down Expand Up @@ -449,11 +436,8 @@ pub fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), anyhow:

tracing::info!("opening input file...");
let mut input_reader = {
let file = std::fs::File::open(&args.path_in)
.map_err(|e| anyhow::anyhow!("could not open input file {}: {}", &args.path_in, e))
.map(std::io::BufReader::new)?;
vcf::reader::Builder
.build_from_reader(file)
.build_from_reader(open_read_maybe_gz(&args.path_in)?)
.map_err(|e| anyhow::anyhow!("could not build VCF reader: {}", e))?
};

Expand All @@ -469,17 +453,7 @@ pub fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), anyhow:
)
.map_err(|e| anyhow::anyhow!("problem building output header: {}", e))?;

let mut output_writer = {
let writer = std::fs::File::create(&args.path_out).map_err(|e| {
anyhow::anyhow!(
"could not output file for writing {}: {}",
&args.path_out,
e
)
})?;
let writer = std::io::BufWriter::new(writer);
vcf::writer::Writer::new(writer)
};
let mut output_writer = { vcf::writer::Writer::new(open_write_maybe_gz(&args.path_out)?) };
output_writer
.write_header(&output_header)
.map_err(|e| anyhow::anyhow!("problem writing header: {}", e))?;
Expand All @@ -501,6 +475,8 @@ pub fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), anyhow:

#[cfg(test)]
mod test {
use std::io::Read;

use rstest::rstest;

use crate::common::GenomeRelease;
Expand Down Expand Up @@ -544,4 +520,44 @@ mod test {

Ok(())
}

fn read_to_bytes<P>(path: P) -> Result<Vec<u8>, anyhow::Error>
where
P: AsRef<std::path::Path>,
{
let mut f = std::fs::File::open(&path).expect("no file found");
let metadata = std::fs::metadata(&path).expect("unable to read metadata");
let mut buffer = vec![0; metadata.len() as usize];
f.read(&mut buffer).expect("buffer overflow");
Ok(buffer)
}

#[test]
fn result_snapshot_test_gz() -> Result<(), anyhow::Error> {
let tmpdir = temp_testdir::TempDir::default();

let path_in: String = "tests/seqvars/ingest/NA12878_dragen.vcf.gz".into();
let path_ped = path_in.replace(".vcf.gz", ".ped");
let path_out = tmpdir
.join("out.vcf.gz")
.to_str()
.expect("invalid path")
.into();
let args_common = Default::default();
let args = super::Args {
max_var_count: None,
path_mehari_db: "tests/seqvars/ingest/db".into(),
path_ped,
genomebuild: GenomeRelease::Grch37,
path_in,
path_out,
};
super::run(&args_common, &args)?;

let mut buffer = Vec::new();
hxdmp::hexdump(&read_to_bytes(&args.path_out)?, &mut buffer)?;
insta::assert_snapshot!(String::from_utf8_lossy(&buffer));

Ok(())
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ expression: "std::fs::read_to_string(out_path_str)?"
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Conditional genotype quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Phase set">
##contig=<ID=1,length=249250621,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=2,length=243199373,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=3,length=198022430,assembly="GRCh37",species="Homo sapiens">
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ expression: "std::fs::read_to_string(out_path_str)?"
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Conditional genotype quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Phase set">
##contig=<ID=1,length=249250621,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=2,length=243199373,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=3,length=198022430,assembly="GRCh37",species="Homo sapiens">
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ expression: "std::fs::read_to_string(out_path_str)?"
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Conditional genotype quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Phase set">
##contig=<ID=1,length=249250621,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=2,length=243199373,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=3,length=198022430,assembly="GRCh37",species="Homo sapiens">
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ expression: "std::fs::read_to_string(out_path_str)?"
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Conditional genotype quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Phase set">
##contig=<ID=1,length=249250621,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=2,length=243199373,assembly="GRCh37",species="Homo sapiens">
##contig=<ID=3,length=198022430,assembly="GRCh37",species="Homo sapiens">
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ expression: "std::fs::read_to_string(out_path_str)?"
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Conditional genotype quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Phase set">
##contig=<ID=chr1,length=248956422,assembly="GRCh38",species="Homo sapiens">
##contig=<ID=chr2,length=242193529,assembly="GRCh38",species="Homo sapiens">
##contig=<ID=chr3,length=198295559,assembly="GRCh38",species="Homo sapiens">
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ expression: "std::fs::read_to_string(out_path_str)?"
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Conditional genotype quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Phase set">
##contig=<ID=chr1,length=248956422,assembly="GRCh38",species="Homo sapiens">
##contig=<ID=chr2,length=242193529,assembly="GRCh38",species="Homo sapiens">
##contig=<ID=chr3,length=198295559,assembly="GRCh38",species="Homo sapiens">
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ expression: "std::fs::read_to_string(out_path_str)?"
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Conditional genotype quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Phase set">
##contig=<ID=chr1,length=248956422,assembly="GRCh38",species="Homo sapiens">
##contig=<ID=chr2,length=242193529,assembly="GRCh38",species="Homo sapiens">
##contig=<ID=chr3,length=198295559,assembly="GRCh38",species="Homo sapiens">
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ expression: "std::fs::read_to_string(out_path_str)?"
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Conditional genotype quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Phase set">
##contig=<ID=chr1,length=248956422,assembly="GRCh38",species="Homo sapiens">
##contig=<ID=chr2,length=242193529,assembly="GRCh38",species="Homo sapiens">
##contig=<ID=chr3,length=198295559,assembly="GRCh38",species="Homo sapiens">
Expand Down
Loading

0 comments on commit d5ca076

Please sign in to comment.