Skip to content

Commit

Permalink
test data
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe committed Oct 5, 2023
1 parent 0ea5bf9 commit dc84e73
Show file tree
Hide file tree
Showing 10 changed files with 379 additions and 124 deletions.
8 changes: 8 additions & 0 deletions src/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,14 @@ pub struct Args {
pub verbose: Verbosity<InfoLevel>,
}

// Allows the common arguments to be built without parsing a command line
// (the `seqvars ingest` smoke test in this commit obtains them via `Default::default()`).
impl Default for Args {

Check warning on line 26 in src/common.rs

View check run for this annotation

Codecov / codecov/patch

src/common.rs#L26

Added line #L26 was not covered by tests
/// Builds `Args` with `verbose` set to `Verbosity::new(0, 0)`.
// NOTE(review): presumably the two zeros are the `-v` / `-q` occurrence counts, i.e. the
// default log level of `InfoLevel` — confirm against the `clap-verbosity-flag` docs.
fn default() -> Self {
Self {
verbose: Verbosity::new(0, 0),
}
}
}

/// Helper to print the current memory resident set size via `tracing`.
pub fn trace_rss_now() {
let me = procfs::process::Process::myself().unwrap();
Expand Down
23 changes: 23 additions & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
pub mod common;
pub mod db;
pub mod seqvars;
pub mod sv;

use clap::{Args, Parser, Subcommand};
Expand Down Expand Up @@ -33,6 +34,8 @@ enum Commands {
Db(Db),
/// SV filtration related commands.
Sv(Sv),
/// Sequence variant related commands.
Seqvars(Seqvars),
}

/// Parsing of "db *" sub commands.
Expand Down Expand Up @@ -67,6 +70,21 @@ enum SvCommands {
Query(sv::query::Args),
}

/// Parsing of "seqvars *" sub commands.
// Wrapper whose only field is the nested sub command; `main` dispatches on `command`.
// NOTE(review): `args_conflicts_with_subcommands` is a clap command setting — presumably it makes
// arguments and sub commands mutually exclusive at this level; confirm against the clap docs.
#[derive(Debug, Args)]
#[command(args_conflicts_with_subcommands = true)]
struct Seqvars {
/// The sub command to run
#[command(subcommand)]
command: SeqvarsCommands,
}

/// Enum supporting the parsing of "seqvars *" sub commands.
#[derive(Debug, Subcommand)]
enum SeqvarsCommands {
// Carries the arguments of `seqvars ingest`; handled by `seqvars::ingest::run`.
Ingest(seqvars::ingest::Args),
}

fn main() -> Result<(), anyhow::Error> {
let cli = Cli::parse();

Expand Down Expand Up @@ -98,6 +116,11 @@ fn main() -> Result<(), anyhow::Error> {
db::to_bin::cli::run(&cli.common, args)?;
}
},
Commands::Seqvars(seqvars) => match &seqvars.command {
SeqvarsCommands::Ingest(args) => {
seqvars::ingest::run(&cli.common, args)?;

Check warning on line 121 in src/main.rs

View check run for this annotation

Codecov / codecov/patch

src/main.rs#L119-L121

Added lines #L119 - L121 were not covered by tests
}
},
Commands::Sv(sv) => match &sv.command {
SvCommands::Query(args) => {
sv::query::run(&cli.common, args)?;
Expand Down
71 changes: 71 additions & 0 deletions src/seqvars/ingest/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
//! Implementation of `seqvars ingest` subcommand.
use crate::common::{self, GenomeRelease};

/// Command line arguments for `seqvars ingest` subcommand.
// NOTE(review): the `///` comments on the fields below are presumably picked up by clap's derive
// as the per-option help text, so rewording them changes `--help` output — confirm before editing.
// NOTE(review): none of the fields is an `Option` or has a default, so all three long options
// are presumably mandatory on the command line — confirm against the clap derive reference.
#[derive(Debug, clap::Parser)]
#[command(author, version, about = "ingest sequence variant VCF", long_about = None)]
pub struct Args {
/// The assumed genome build.
#[clap(long)]
pub genomebuild: GenomeRelease,
/// Path to input file.
#[clap(long)]
pub path_in: String,
/// Path to output file.
#[clap(long)]
pub path_out: String,
}

/// Main entry point for `seqvars ingest` sub command.
///
/// As written here, the function only logs both argument structs at info level, traces the
/// resident set size via `common::trace_rss_now`, logs the elapsed wall-clock time and returns
/// `Ok(())`. `args.path_in` and `args.path_out` are not read beyond being debug-printed, so no
/// input is parsed and no output file is written yet.
///
/// # Errors
///
/// The signature allows an `anyhow::Error`, but no fallible call using `?` is visible in the body.
pub fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), anyhow::Error> {
// Start the clock before any work so the final log line covers the whole call.
let before_anything = std::time::Instant::now();
tracing::info!("args_common = {:#?}", &args_common);
tracing::info!("args = {:#?}", &args);

common::trace_rss_now();

tracing::info!(
"All of `seqvars ingest` completed in {:?}",
before_anything.elapsed()

Check warning on line 30 in src/seqvars/ingest/mod.rs

View check run for this annotation

Codecov / codecov/patch

src/seqvars/ingest/mod.rs#L30

Added line #L30 was not covered by tests
);
Ok(())
}

#[cfg(test)]
mod test {
    use rstest::rstest;

    use crate::common::GenomeRelease;

    /// Binds an `insta` settings scope whose snapshot suffix is the formatted arguments.
    ///
    /// The macro expands to plain statements in the caller's block, so the `_guard` binding
    /// stays alive until the end of that block.
    macro_rules! set_snapshot_suffix {
        ($($expr:expr),*) => {
            let mut settings = insta::Settings::clone_current();
            settings.set_snapshot_suffix(format!($($expr,)*));
            let _guard = settings.bind_to_scope();
        }
    }

    /// Smoke test: `run` must return `Ok` for each example VCF path.
    #[rstest]
    #[case("tests/seqvars/ingest/example_dragen.07.021.624.3.10.4.vcf")]
    #[case("tests/seqvars/ingest/example_dragen.07.021.624.3.10.9.vcf")]
    #[case("tests/seqvars/ingest/example_gatk_hc.3.7-0.vcf")]
    #[case("tests/seqvars/ingest/example_gatk_hc.4.4.0.0.vcf")]
    fn smoke_test_run(#[case] path: &str) {
        // The snapshot suffix is the file name (last `/`-separated component) with dots
        // replaced by underscores, debug-formatted as before.
        let file_name = path.rsplit('/').next().unwrap();
        set_snapshot_suffix!("{:?}", file_name.replace('.', "_"));

        let tmpdir = temp_testdir::TempDir::default();
        let out_file = tmpdir.join("out.vcf");

        let args_common = crate::common::Args::default();
        let args = super::Args {
            genomebuild: GenomeRelease::Grch37,
            path_in: path.to_string(),
            path_out: out_file.to_str().expect("invalid path").to_string(),
        };
        super::run(&args_common, &args).unwrap();
    }
}
1 change: 1 addition & 0 deletions src/seqvars/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pub mod ingest;
120 changes: 120 additions & 0 deletions tests/seqvars/ingest/example_dragen.07.021.624.3.10.4.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
##fileformat=VCFv4.2
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths (counting only informative reads out of the total reads) for the ref and alt alleles in the order listed">
##FORMAT=<ID=AF,Number=A,Type=Float,Description="Allele fractions for alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=F1R2,Number=R,Type=Integer,Description="Count of reads in F1R2 pair orientation supporting each allele">
##FORMAT=<ID=F2R1,Number=R,Type=Integer,Description="Count of reads in F2R1 pair orientation supporting each allele">
##FORMAT=<ID=GP,Number=G,Type=Float,Description="Phred-scaled posterior probabilities for genotypes as defined in the VCF specification">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=MB,Number=4,Type=Integer,Description="Per-sample component statistics to detect mate bias">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##FORMAT=<ID=PRI,Number=G,Type=Float,Description="Phred-scaled prior probabilities for genotypes">
##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias">
##FORMAT=<ID=SQ,Number=A,Type=Float,Description="Somatic quality">
##DRAGENCommandLine=<ID=HashTableBuild,Version="SW: 01.003.044.3.10.4, HashTableVersion: 8",CommandLineOptions="dragen --build-hash-table true --ht-reference /staging/human/reference/hs37d5/hs37d5.fa --output-dir /staging/human/reference/hs37d5/hs37d5.fa.k_21.f_16.m_149 --ht-num-threads 32 --ht-seed-len 21 --enable-cnv true">
##DRAGENCommandLine=<ID=dragen,Version="SW: 07.021.624.3.10.4, HW: 07.021.624",Date="Mon Mar 21 19:12:18 CET 2022",CommandLineOptions="--ref-dir /staging/human/reference/hs37d5/hs37d5.fa.k_21.f_16.m_149 --fastq-file1 /mnt/smb01-hum/NGSRawData/220318_A01077_0174_AH7JGVDMXY/Data/Intensities/BaseCalls/NA-12878WGS-Genom-size_S1_R1_001.fastq.gz --fastq-file2 /mnt/smb01-hum/NGSRawData/220318_A01077_0174_AH7JGVDMXY/Data/Intensities/BaseCalls/NA-12878WGS-Genom-size_S1_R2_001.fastq.gz --output-directory /staging/output/220318_A01077_0174_AH7JGVDMXY/NA-12878WGSWGS/ --output-file-prefix NA-12878WGSWGS_dragen --RGID WGS --RGSM NA-12878WGSWGS --num-threads 46 --enable-map-align true --enable-map-align-output true --enable-duplicate-marking true --enable-variant-caller true --qc-cross-cont-vcf /opt/edico/config/sample_cross_contamination_resource_GRCh37.vcf.gz --enable-cnv true --cnv-enable-self-normalization true --enable-sv true --qc-coverage-region-1 /staging/human/bed/CDS-v19-ROIs_v2.bed --qc-coverage-reports-1 cov_report full_res --qc-coverage-region-2 /staging/human/bed/Regions_Exomev8.bed --qc-coverage-reports-2 cov_report full_res --qc-coverage-region-3 /staging/human/bed/Padded_Exomev8.bed --qc-coverage-reports-3 cov_report full_res">
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP Membership">
##INFO=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias">
##INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth">
##INFO=<ID=SOR,Number=1,Type=Float,Description="Symmetric Odds Ratio of 2x2 contingency table to detect strand bias">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (informative and non-informative); some reads may have been filtered based on mapq etc.">
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
##INFO=<ID=FractionInformativeReads,Number=1,Type=Float,Description="The fraction of informative reads out of the total reads">
##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities">
##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">
##contig=<ID=1,length=249250621>
##contig=<ID=2,length=243199373>
##contig=<ID=3,length=198022430>
##contig=<ID=4,length=191154276>
##contig=<ID=5,length=180915260>
##contig=<ID=6,length=171115067>
##contig=<ID=7,length=159138663>
##contig=<ID=8,length=146364022>
##contig=<ID=9,length=141213431>
##contig=<ID=10,length=135534747>
##contig=<ID=11,length=135006516>
##contig=<ID=12,length=133851895>
##contig=<ID=13,length=115169878>
##contig=<ID=14,length=107349540>
##contig=<ID=15,length=102531392>
##contig=<ID=16,length=90354753>
##contig=<ID=17,length=81195210>
##contig=<ID=18,length=78077248>
##contig=<ID=19,length=59128983>
##contig=<ID=20,length=63025520>
##contig=<ID=21,length=48129895>
##contig=<ID=22,length=51304566>
##contig=<ID=X,length=155270560>
##contig=<ID=Y,length=59373566>
##contig=<ID=MT,length=16569>
##contig=<ID=GL000207.1,length=4262>
##contig=<ID=GL000226.1,length=15008>
##contig=<ID=GL000229.1,length=19913>
##contig=<ID=GL000231.1,length=27386>
##contig=<ID=GL000210.1,length=27682>
##contig=<ID=GL000239.1,length=33824>
##contig=<ID=GL000235.1,length=34474>
##contig=<ID=GL000201.1,length=36148>
##contig=<ID=GL000247.1,length=36422>
##contig=<ID=GL000245.1,length=36651>
##contig=<ID=GL000197.1,length=37175>
##contig=<ID=GL000203.1,length=37498>
##contig=<ID=GL000246.1,length=38154>
##contig=<ID=GL000249.1,length=38502>
##contig=<ID=GL000196.1,length=38914>
##contig=<ID=GL000248.1,length=39786>
##contig=<ID=GL000244.1,length=39929>
##contig=<ID=GL000238.1,length=39939>
##contig=<ID=GL000202.1,length=40103>
##contig=<ID=GL000234.1,length=40531>
##contig=<ID=GL000232.1,length=40652>
##contig=<ID=GL000206.1,length=41001>
##contig=<ID=GL000240.1,length=41933>
##contig=<ID=GL000236.1,length=41934>
##contig=<ID=GL000241.1,length=42152>
##contig=<ID=GL000243.1,length=43341>
##contig=<ID=GL000242.1,length=43523>
##contig=<ID=GL000230.1,length=43691>
##contig=<ID=GL000237.1,length=45867>
##contig=<ID=GL000233.1,length=45941>
##contig=<ID=GL000204.1,length=81310>
##contig=<ID=GL000198.1,length=90085>
##contig=<ID=GL000208.1,length=92689>
##contig=<ID=GL000191.1,length=106433>
##contig=<ID=GL000227.1,length=128374>
##contig=<ID=GL000228.1,length=129120>
##contig=<ID=GL000214.1,length=137718>
##contig=<ID=GL000221.1,length=155397>
##contig=<ID=GL000209.1,length=159169>
##contig=<ID=GL000218.1,length=161147>
##contig=<ID=GL000220.1,length=161802>
##contig=<ID=GL000213.1,length=164239>
##contig=<ID=GL000211.1,length=166566>
##contig=<ID=GL000199.1,length=169874>
##contig=<ID=GL000217.1,length=172149>
##contig=<ID=GL000216.1,length=172294>
##contig=<ID=GL000215.1,length=172545>
##contig=<ID=GL000205.1,length=174588>
##contig=<ID=GL000219.1,length=179198>
##contig=<ID=GL000224.1,length=179693>
##contig=<ID=GL000223.1,length=180455>
##contig=<ID=GL000195.1,length=182896>
##contig=<ID=GL000212.1,length=186858>
##contig=<ID=GL000222.1,length=186861>
##contig=<ID=GL000200.1,length=187035>
##contig=<ID=GL000193.1,length=189789>
##contig=<ID=GL000194.1,length=191469>
##contig=<ID=GL000225.1,length=211173>
##contig=<ID=GL000192.1,length=547496>
##contig=<ID=NC_007605,length=171823>
##contig=<ID=hs37d5,length=35477943>
##reference=file:///staging/human/reference/hs37d5/hs37d5.fa.k_21.f_16.m_149/reference.bin
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878
17 41256074 . CA C 44.34 . AC=1;AF=0.500;AN=2;DP=18;FS=0.000;MQ=243.16;MQRankSum=3.266;QD=2.46;ReadPosRankSum=2.663;SOR=1.112;FractionInformativeReads=0.944 GT:AD:AF:DP:F1R2:F2R1:GQ:PL:GP:PRI:SB:MB:PS 0|1:11,6:0.353:17:8,3:3,3:43:55,0,47:4.4342e+01,2.0325e-04,5.0000e+01:0.00,11.00,14.01:5,6,2,4:4,7,5,1:41256074
MT 750 . A G . . DP=7835;MQ=167.68;FractionInformativeReads=0.998 GT:SQ:AD:AF:F1R2:F2R1:DP:SB:MB 1/1:98.13:1,7818:1.000:1,3815:0,4003:7819:1,0,4126,3692:1,0,3979,3839
Loading

0 comments on commit dc84e73

Please sign in to comment.