-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: implement "seqvars ingest" command (#199)
- Loading branch information
Showing
21 changed files
with
1,411 additions
and
0 deletions.
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
use noodles_vcf as vcf; | ||
|
||
use crate::common::GenomeRelease; | ||
|
||
/// Enumeration for the known variant callers. | ||
#[derive(Debug, Clone, PartialEq, Eq, serde::Deserialize, serde::Serialize)] | ||
pub enum VariantCaller { | ||
GatkHaplotypeCaller { version: String }, | ||
GatkUnifiedGenotyper { version: String }, | ||
Dragen { version: String }, | ||
Other, | ||
} | ||
|
||
impl VariantCaller { | ||
pub fn guess(header: &vcf::Header) -> Option<Self> { | ||
for (other, collection) in header.other_records() { | ||
if other.as_ref().starts_with("GATKCommandLine") | ||
|| other.as_ref().starts_with("DRAGENCommandLine") | ||
{ | ||
use vcf::header::record::value::collection::Collection; | ||
if let Collection::Structured(map) = collection { | ||
for (key, values) in map.iter() { | ||
if let ("HaplotypeCaller", Some(version)) = | ||
(key.as_str(), values.other_fields().get("Version").cloned()) | ||
{ | ||
return Some(VariantCaller::GatkHaplotypeCaller { version }); | ||
} | ||
if let ("UnifiedGenotyper", Some(version)) = | ||
(key.as_str(), values.other_fields().get("Version").cloned()) | ||
{ | ||
return Some(VariantCaller::GatkUnifiedGenotyper { version }); | ||
} | ||
if let ("dragen", Some(version)) = | ||
(key.as_str(), values.other_fields().get("Version").cloned()) | ||
{ | ||
return Some(VariantCaller::Dragen { version }); | ||
} | ||
} | ||
} | ||
} | ||
} | ||
None | ||
} | ||
} | ||
|
||
/// Generate the output header from the input header. | ||
pub fn build_output_header( | ||
input_header: &vcf::Header, | ||
genomebuild: GenomeRelease, | ||
Check warning on line 49 in src/seqvars/ingest/header.rs GitHub Actions / clippyunused variable: `genomebuild`
|
||
) -> Result<vcf::Header, anyhow::Error> { | ||
let variant_caller = VariantCaller::guess(input_header) | ||
Check warning on line 51 in src/seqvars/ingest/header.rs GitHub Actions / clippyunused variable: `variant_caller`
|
||
.ok_or_else(|| anyhow::anyhow!("Unable to guess variant caller"))?; | ||
todo!() | ||
} | ||
|
||
#[cfg(test)]
mod test {
    use rstest::rstest;

    use super::VariantCaller;

    /// Bind an `insta` snapshot suffix built from the given format arguments.
    ///
    /// The settings guard is created in the calling scope, so the suffix
    /// applies until the end of the enclosing block.
    macro_rules! set_snapshot_suffix {
        ($($expr:expr),*) => {
            let mut settings = insta::Settings::clone_current();
            settings.set_snapshot_suffix(format!($($expr,)*));
            let _guard = settings.bind_to_scope();
        }
    }

    /// Snapshot the variant caller guessed from each example VCF header.
    #[rstest]
    #[case("tests/seqvars/ingest/example_dragen.07.021.624.3.10.4.vcf")]
    #[case("tests/seqvars/ingest/example_dragen.07.021.624.3.10.9.vcf")]
    #[case("tests/seqvars/ingest/example_gatk_hc.3.7-0.vcf")]
    #[case("tests/seqvars/ingest/example_gatk_hc.4.4.0.0.vcf")]
    fn variant_caller_guess(#[case] path: &str) -> Result<(), anyhow::Error> {
        set_snapshot_suffix!("{:?}", path.split('/').last().unwrap());

        let vcf_header = noodles_vcf::reader::Builder::default()
            .build_from_path(path)?
            .read_header()?;

        insta::assert_yaml_snapshot!(VariantCaller::guess(&vcf_header));

        Ok(())
    }

    /// Snapshot the header generated by `build_output_header` for each example VCF.
    #[rstest]
    #[case("tests/seqvars/ingest/example_dragen.07.021.624.3.10.4.vcf")]
    #[case("tests/seqvars/ingest/example_dragen.07.021.624.3.10.9.vcf")]
    #[case("tests/seqvars/ingest/example_gatk_hc.3.7-0.vcf")]
    #[case("tests/seqvars/ingest/example_gatk_hc.4.4.0.0.vcf")]
    fn build_output_header(#[case] path: &str) -> Result<(), anyhow::Error> {
        set_snapshot_suffix!("{:?}", path.split('/').last().unwrap());
        let tmpdir = temp_testdir::TempDir::default();

        let input_vcf_header = noodles_vcf::reader::Builder::default()
            .build_from_path(path)?
            .read_header()?;
        let output_vcf_header =
            super::build_output_header(&input_vcf_header, crate::common::GenomeRelease::Grch37)?;

        let out_path = tmpdir.join("out.vcf");
        let out_path_str = out_path.to_str().expect("invalid path");
        {
            // Write the *generated* header (not the input header) so that the
            // snapshot below actually covers `build_output_header`.  The inner
            // scope drops the writer, closing the file before it is read back.
            noodles_vcf::writer::Writer::new(std::fs::File::create(out_path_str)?)
                .write_header(&output_vcf_header)?;
        }

        insta::assert_snapshot!(std::fs::read_to_string(out_path_str)?);

        Ok(())
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,103 @@ | ||
//! Implementation of `seqvars ingest` subcommand. | ||
use crate::common::{self, GenomeRelease}; | ||
use noodles_vcf as vcf; | ||
|
||
pub mod header; | ||
|
||
/// Command line arguments for `seqvars ingest` subcommand. | ||
#[derive(Debug, clap::Parser)] | ||
#[command(author, version, about = "ingest sequence variant VCF", long_about = None)] | ||
pub struct Args { | ||
/// The assumed genome build. | ||
#[clap(long)] | ||
pub genomebuild: GenomeRelease, | ||
/// Path to input file. | ||
#[clap(long)] | ||
pub path_in: String, | ||
/// Path to output file. | ||
#[clap(long)] | ||
pub path_out: String, | ||
} | ||
|
||
/// Main entry point for `seqvars ingest` sub command. | ||
pub fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), anyhow::Error> { | ||
let before_anything = std::time::Instant::now(); | ||
tracing::info!("args_common = {:#?}", &args_common); | ||
tracing::info!("args = {:#?}", &args); | ||
|
||
common::trace_rss_now(); | ||
|
||
tracing::info!("opening input file..."); | ||
let mut input_reader = { | ||
let file = std::fs::File::open(&args.path_in) | ||
.map_err(|e| anyhow::anyhow!("could not open input file {}: {}", &args.path_in, e)) | ||
.map(std::io::BufReader::new)?; | ||
vcf::reader::Builder::default() | ||
Check warning on line 36 in src/seqvars/ingest/mod.rs GitHub Actions / clippyuse of `default` to create a unit struct
|
||
.build_from_reader(file) | ||
.map_err(|e| anyhow::anyhow!("could not build VCF reader: {}", e))? | ||
}; | ||
|
||
tracing::info!("analyzing header..."); | ||
let input_header = input_reader | ||
.read_header() | ||
.map_err(|e| anyhow::anyhow!("problem reading VCF header: {}", e))?; | ||
let output_header = header::build_output_header(&input_header, args.genomebuild)?; | ||
|
||
let mut output_writer = { | ||
let writer = std::fs::File::create(&args.path_out).map_err(|e| { | ||
anyhow::anyhow!( | ||
"could not output file for writing {}: {}", | ||
&args.path_out, | ||
e | ||
) | ||
})?; | ||
let writer = std::io::BufWriter::new(writer); | ||
vcf::writer::Writer::new(writer) | ||
}; | ||
output_writer.write_header(&output_header)?; | ||
|
||
tracing::info!( | ||
"All of `seqvars ingest` completed in {:?}", | ||
before_anything.elapsed() | ||
); | ||
Ok(()) | ||
} | ||
|
||
#[cfg(test)]
mod test {
    use rstest::rstest;

    use crate::common::GenomeRelease;

    /// Bind an `insta` snapshot suffix built from the given format arguments.
    ///
    /// The settings guard is created in the calling scope, so the suffix
    /// applies until the end of the enclosing block.
    macro_rules! set_snapshot_suffix {
        ($($expr:expr),*) => {
            let mut settings = insta::Settings::clone_current();
            settings.set_snapshot_suffix(format!($($expr,)*));
            let _guard = settings.bind_to_scope();
        }
    }

    /// Run `seqvars ingest` on each example VCF and expect it to succeed.
    #[rstest]
    #[case("tests/seqvars/ingest/example_dragen.07.021.624.3.10.4.vcf")]
    #[case("tests/seqvars/ingest/example_dragen.07.021.624.3.10.9.vcf")]
    #[case("tests/seqvars/ingest/example_gatk_hc.3.7-0.vcf")]
    #[case("tests/seqvars/ingest/example_gatk_hc.4.4.0.0.vcf")]
    fn smoke_test_run(#[case] path: &str) {
        // Snapshot suffix: the input file name with dots replaced by underscores.
        let suffix = path.split('/').last().unwrap().replace(".", "_");
        set_snapshot_suffix!("{:?}", suffix);

        let tmpdir = temp_testdir::TempDir::default();
        let out_path = tmpdir.join("out.vcf");

        let args_common = crate::common::Args::default();
        let args = super::Args {
            genomebuild: GenomeRelease::Grch37,
            path_in: path.into(),
            path_out: out_path.to_str().expect("invalid path").into(),
        };

        super::run(&args_common, &args).unwrap();
    }
}
Oops, something went wrong.