diff --git a/src/annotate/strucvars/mod.rs b/src/annotate/strucvars/mod.rs
index 902e9949..13c5a2d8 100644
--- a/src/annotate/strucvars/mod.rs
+++ b/src/annotate/strucvars/mod.rs
@@ -1627,11 +1627,7 @@ impl SvCaller {
}
/// Guess the `SvCaller` from the VCF file at the given path.
-pub fn guess_sv_caller
(p: P) -> Result
-where
- P: AsRef,
-{
- let reader = open_read_maybe_gz(p)?;
+pub fn guess_sv_caller(reader: Box) -> Result {
let mut reader = noodles_vcf::reader::Builder.build_from_reader(reader)?;
let header = reader.read_header()?;
let mut records = reader.records(&header);
@@ -2643,12 +2639,14 @@ pub fn build_vcf_record_converter>(
}
}
-/// Convert the records in the VCF path `path_input` to the JSONL file per contig in `tmp_dir`.
+/// Convert the records in the VCF reader to the JSONL file per contig in `tmp_dir`.
///
/// Note that we will consider the "25 canonical" contigs only (chr1..chr22, chrX, chrY, chrM).
-fn run_vcf_to_jsonl(
+pub fn run_vcf_to_jsonl(
pedigree: &PedigreeByName,
- path_input: &str,
+ reader: &mut noodles_vcf::Reader>,
+ header: &VcfHeader,
+ sv_caller: &SvCaller,
tmp_dir: &TempDir,
cov_readers: &mut HashMap,
rng: &mut StdRng,
@@ -2671,24 +2669,17 @@ fn run_vcf_to_jsonl(
files
};
- tracing::debug!("processing VCF file {}", path_input);
- let sv_caller = guess_sv_caller(path_input)?;
- tracing::debug!("guessed caller/version to be {:?}", &sv_caller);
-
- let mut reader = noodles_vcf::reader::Builder.build_from_path(path_input)?;
- let header: VcfHeader = reader.read_header()?;
-
let samples = header
.sample_names()
.iter()
.map(|s| s.to_string())
.collect::>();
- let converter = build_vcf_record_converter(&sv_caller, &samples);
+ let converter = build_vcf_record_converter(sv_caller, &samples);
let mapping = CHROM_TO_CHROM_NO.deref();
let mut uuid_buf = [0u8; 16];
- for record in reader.records(&header) {
+ for record in reader.records(header) {
rng.fill_bytes(&mut uuid_buf);
let uuid = Uuid::from_bytes(uuid_buf);
@@ -2746,15 +2737,19 @@ fn annotate_cov_mq(
///
/// * `tmp_dir`: The temporary directory containing the JSONL files.
/// * `contig_no`: The contig number (1..25).
-/// * `args`: The command line arguments.
+/// * `slack_ins`: Slack around INS
+/// * `slack_bnd`: Slack around BND
+/// * `min_overlap`: Minimal reciprocal overlap
///
/// # Returns
///
/// The clustered records for the contig.
-fn read_and_cluster_for_contig(
+pub fn read_and_cluster_for_contig(
tmp_dir: &TempDir,
contig_no: usize,
- args: &Args,
+ slack_ins: i32,
+ slack_bnd: i32,
+ min_overlap: f32,
) -> Result, anyhow::Error> {
let jsonl_path = tmp_dir.path().join(format!("chrom-{}.jsonl", contig_no));
tracing::debug!("clustering files from {}", jsonl_path.display());
@@ -2773,8 +2768,8 @@ fn read_and_cluster_for_contig(
match jsonl::read::<_, VarFishStrucvarTsvRecord>(&mut reader) {
Ok(record) => {
let slack = match record.sv_type {
- SvType::Ins => args.slack_ins,
- SvType::Bnd => args.slack_bnd,
+ SvType::Ins => slack_ins,
+ SvType::Bnd => slack_bnd,
_ => 0,
};
let query = match record.sv_type {
@@ -2800,7 +2795,7 @@ fn read_and_cluster_for_contig(
_ => {
let ovl = record.overlap(&records[*record_id]);
assert!(ovl >= 0f32);
- ovl >= args.min_overlap
+ ovl >= min_overlap
}
};
let match_this_sv_type = record.sv_type == records[*record_id].sv_type;
@@ -2891,7 +2886,22 @@ fn run_with_writer(
// Read through input VCF files and write out to temporary files.
tracing::info!("Input VCF files to temporary files...");
for path_input in args.path_input_vcf.iter() {
- run_vcf_to_jsonl(pedigree, path_input, &tmp_dir, &mut cov_readers, &mut rng)?;
+ tracing::debug!("processing VCF file {}", path_input);
+ let reader = open_read_maybe_gz(path_input)?;
+ let sv_caller = guess_sv_caller(reader)?;
+ tracing::debug!("guessed caller/version to be {:?}", &sv_caller);
+
+ let mut reader = noodles_vcf::reader::Builder.build_from_path(path_input)?;
+ let header: VcfHeader = reader.read_header()?;
+ run_vcf_to_jsonl(
+ pedigree,
+ &mut reader,
+ &header,
+ &sv_caller,
+ &tmp_dir,
+ &mut cov_readers,
+ &mut rng,
+ )?;
}
tracing::info!("... done converting input files.");
@@ -2916,7 +2926,13 @@ fn run_with_writer(
// Read through temporary files by contig, cluster by overlap as configured, and write to `writer`.
for contig_no in 1..=25 {
tracing::info!(" contig: {}", CANONICAL[contig_no - 1]);
- let clusters = read_and_cluster_for_contig(&tmp_dir, contig_no, args)?;
+ let clusters = read_and_cluster_for_contig(
+ &tmp_dir,
+ contig_no,
+ args.slack_ins,
+ args.slack_bnd,
+ args.min_overlap,
+ )?;
for record in clusters {
writer.write_record(&header_out, &record.try_into()?)?;
}
@@ -3135,7 +3151,7 @@ mod test {
VarFishStrucvarTsvRecord,
},
},
- common::GenomeRelease,
+ common::{open_read_maybe_gz, GenomeRelease},
ped::{Disease, Individual, PedigreeByName, Sex},
};
@@ -3376,7 +3392,7 @@ mod test {
let path_expected_txt = format!("tests/data/annotate/strucvars/{}-min.out.jsonl", key);
let samples = vcf_samples(&path_input_vcf)?;
let pedigree: PedigreeByName = sample_ped(&samples);
- let sv_caller = guess_sv_caller(&path_input_vcf)?;
+ let sv_caller = guess_sv_caller(open_read_maybe_gz(&path_input_vcf)?)?;
let converter = build_vcf_record_converter(&sv_caller, &samples);
run_test_vcf_to_jsonl(
@@ -3392,7 +3408,9 @@ mod test {
#[test]
fn guess_sv_caller_delly() -> Result<(), anyhow::Error> {
- let sv_caller = guess_sv_caller("tests/data/annotate/strucvars/delly2-min.vcf")?;
+ let sv_caller = guess_sv_caller(open_read_maybe_gz(
+ "tests/data/annotate/strucvars/delly2-min.vcf",
+ )?)?;
insta::assert_debug_snapshot!(sv_caller);
Ok(())
@@ -3400,7 +3418,9 @@ mod test {
#[test]
fn guess_sv_caller_dragen_sv() -> Result<(), anyhow::Error> {
- let sv_caller = guess_sv_caller("tests/data/annotate/strucvars/dragen-sv-min.vcf")?;
+ let sv_caller = guess_sv_caller(open_read_maybe_gz(
+ "tests/data/annotate/strucvars/dragen-sv-min.vcf",
+ )?)?;
insta::assert_debug_snapshot!(sv_caller);
Ok(())
@@ -3408,7 +3428,9 @@ mod test {
#[test]
fn guess_sv_caller_dragen_cnv() -> Result<(), anyhow::Error> {
- let sv_caller = guess_sv_caller("tests/data/annotate/strucvars/dragen-cnv-min.vcf")?;
+ let sv_caller = guess_sv_caller(open_read_maybe_gz(
+ "tests/data/annotate/strucvars/dragen-cnv-min.vcf",
+ )?)?;
insta::assert_debug_snapshot!(sv_caller);
Ok(())
@@ -3416,7 +3438,9 @@ mod test {
#[test]
fn guess_sv_caller_gcnv() -> Result<(), anyhow::Error> {
- let sv_caller = guess_sv_caller("tests/data/annotate/strucvars/gcnv-min.vcf")?;
+ let sv_caller = guess_sv_caller(open_read_maybe_gz(
+ "tests/data/annotate/strucvars/gcnv-min.vcf",
+ )?)?;
insta::assert_debug_snapshot!(sv_caller);
Ok(())
@@ -3424,7 +3448,9 @@ mod test {
#[test]
fn guess_sv_caller_manta() -> Result<(), anyhow::Error> {
- let sv_caller = guess_sv_caller("tests/data/annotate/strucvars/manta-min.vcf")?;
+ let sv_caller = guess_sv_caller(open_read_maybe_gz(
+ "tests/data/annotate/strucvars/manta-min.vcf",
+ )?)?;
insta::assert_debug_snapshot!(sv_caller);
Ok(())
@@ -3432,7 +3458,9 @@ mod test {
#[test]
fn guess_sv_caller_melt() -> Result<(), anyhow::Error> {
- let sv_caller = guess_sv_caller("tests/data/annotate/strucvars/melt-min.vcf")?;
+ let sv_caller = guess_sv_caller(open_read_maybe_gz(
+ "tests/data/annotate/strucvars/melt-min.vcf",
+ )?)?;
insta::assert_debug_snapshot!(sv_caller);
Ok(())
@@ -3440,7 +3468,9 @@ mod test {
#[test]
fn guess_sv_caller_popdel() -> Result<(), anyhow::Error> {
- let sv_caller = guess_sv_caller("tests/data/annotate/strucvars/popdel-min.vcf")?;
+ let sv_caller = guess_sv_caller(open_read_maybe_gz(
+ "tests/data/annotate/strucvars/popdel-min.vcf",
+ )?)?;
insta::assert_debug_snapshot!(sv_caller);
Ok(())