Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: expose strucvars annotation for worker aggregation command #204

Merged
merged 1 commit into from
Oct 9, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 64 additions & 34 deletions src/annotate/strucvars/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1627,11 +1627,7 @@ impl SvCaller {
}

/// Guess the `SvCaller` from the VCF file at the given path.
pub fn guess_sv_caller<P>(p: P) -> Result<SvCaller, anyhow::Error>
where
P: AsRef<Path>,
{
let reader = open_read_maybe_gz(p)?;
pub fn guess_sv_caller(reader: Box<dyn std::io::BufRead>) -> Result<SvCaller, anyhow::Error> {
let mut reader = noodles_vcf::reader::Builder.build_from_reader(reader)?;
let header = reader.read_header()?;
let mut records = reader.records(&header);
Expand Down Expand Up @@ -2643,12 +2639,14 @@ pub fn build_vcf_record_converter<T: AsRef<str>>(
}
}

/// Convert the records in the VCF path `path_input` to the JSONL file per contig in `tmp_dir`.
/// Convert the records in the VCF reader to the JSONL file per contig in `tmp_dir`.
///
/// Note that we will consider the "25 canonical" contigs only (chr1..chr22, chrX, chrY, chrM).
fn run_vcf_to_jsonl(
pub fn run_vcf_to_jsonl(
pedigree: &PedigreeByName,
path_input: &str,
reader: &mut noodles_vcf::Reader<Box<dyn std::io::BufRead>>,
header: &VcfHeader,
sv_caller: &SvCaller,
tmp_dir: &TempDir,
cov_readers: &mut HashMap<String, maelstrom::Reader>,
rng: &mut StdRng,
Expand All @@ -2671,24 +2669,17 @@ fn run_vcf_to_jsonl(
files
};

tracing::debug!("processing VCF file {}", path_input);
let sv_caller = guess_sv_caller(path_input)?;
tracing::debug!("guessed caller/version to be {:?}", &sv_caller);

let mut reader = noodles_vcf::reader::Builder.build_from_path(path_input)?;
let header: VcfHeader = reader.read_header()?;

let samples = header
.sample_names()
.iter()
.map(|s| s.to_string())
.collect::<Vec<_>>();
let converter = build_vcf_record_converter(&sv_caller, &samples);
let converter = build_vcf_record_converter(sv_caller, &samples);

let mapping = CHROM_TO_CHROM_NO.deref();
let mut uuid_buf = [0u8; 16];

for record in reader.records(&header) {
for record in reader.records(header) {
rng.fill_bytes(&mut uuid_buf);
let uuid = Uuid::from_bytes(uuid_buf);

Expand Down Expand Up @@ -2746,15 +2737,19 @@ fn annotate_cov_mq(
///
/// * `tmp_dir`: The temporary directory containing the JSONL files.
/// * `contig_no`: The contig number (1..25).
/// * `args`: The command line arguments.
/// * `slack_ins`: Slack around INS
/// * `slack_bnd`: Slack around BND
/// * `min_overlap`: Minimal reciprocal overlap
///
/// # Returns
///
/// The clustered records for the contig.
fn read_and_cluster_for_contig(
pub fn read_and_cluster_for_contig(
tmp_dir: &TempDir,
contig_no: usize,
args: &Args,
slack_ins: i32,
slack_bnd: i32,
min_overlap: f32,
) -> Result<Vec<VarFishStrucvarTsvRecord>, anyhow::Error> {
let jsonl_path = tmp_dir.path().join(format!("chrom-{}.jsonl", contig_no));
tracing::debug!("clustering files from {}", jsonl_path.display());
Expand All @@ -2773,8 +2768,8 @@ fn read_and_cluster_for_contig(
match jsonl::read::<_, VarFishStrucvarTsvRecord>(&mut reader) {
Ok(record) => {
let slack = match record.sv_type {
SvType::Ins => args.slack_ins,
SvType::Bnd => args.slack_bnd,
SvType::Ins => slack_ins,
SvType::Bnd => slack_bnd,
_ => 0,
};
let query = match record.sv_type {
Expand All @@ -2800,7 +2795,7 @@ fn read_and_cluster_for_contig(
_ => {
let ovl = record.overlap(&records[*record_id]);
assert!(ovl >= 0f32);
ovl >= args.min_overlap
ovl >= min_overlap
}
};
let match_this_sv_type = record.sv_type == records[*record_id].sv_type;
Expand Down Expand Up @@ -2891,7 +2886,22 @@ fn run_with_writer(
// Read through input VCF files and write out to temporary files.
tracing::info!("Input VCF files to temporary files...");
for path_input in args.path_input_vcf.iter() {
run_vcf_to_jsonl(pedigree, path_input, &tmp_dir, &mut cov_readers, &mut rng)?;
tracing::debug!("processing VCF file {}", path_input);
let reader = open_read_maybe_gz(path_input)?;
let sv_caller = guess_sv_caller(reader)?;
tracing::debug!("guessed caller/version to be {:?}", &sv_caller);

let mut reader = noodles_vcf::reader::Builder.build_from_path(path_input)?;
let header: VcfHeader = reader.read_header()?;
run_vcf_to_jsonl(
pedigree,
&mut reader,
&header,
&sv_caller,
&tmp_dir,
&mut cov_readers,
&mut rng,
)?;
}
tracing::info!("... done converting input files.");

Expand All @@ -2916,7 +2926,13 @@ fn run_with_writer(
// Read through temporary files by contig, cluster by overlap as configured, and write to `writer`.
for contig_no in 1..=25 {
tracing::info!(" contig: {}", CANONICAL[contig_no - 1]);
let clusters = read_and_cluster_for_contig(&tmp_dir, contig_no, args)?;
let clusters = read_and_cluster_for_contig(
&tmp_dir,
contig_no,
args.slack_ins,
args.slack_bnd,
args.min_overlap,
)?;
for record in clusters {
writer.write_record(&header_out, &record.try_into()?)?;
}
Expand Down Expand Up @@ -3135,7 +3151,7 @@ mod test {
VarFishStrucvarTsvRecord,
},
},
common::GenomeRelease,
common::{open_read_maybe_gz, GenomeRelease},
ped::{Disease, Individual, PedigreeByName, Sex},
};

Expand Down Expand Up @@ -3376,7 +3392,7 @@ mod test {
let path_expected_txt = format!("tests/data/annotate/strucvars/{}-min.out.jsonl", key);
let samples = vcf_samples(&path_input_vcf)?;
let pedigree: PedigreeByName = sample_ped(&samples);
let sv_caller = guess_sv_caller(&path_input_vcf)?;
let sv_caller = guess_sv_caller(open_read_maybe_gz(&path_input_vcf)?)?;
let converter = build_vcf_record_converter(&sv_caller, &samples);

run_test_vcf_to_jsonl(
Expand All @@ -3392,55 +3408,69 @@ mod test {

#[test]
fn guess_sv_caller_delly() -> Result<(), anyhow::Error> {
let sv_caller = guess_sv_caller("tests/data/annotate/strucvars/delly2-min.vcf")?;
let sv_caller = guess_sv_caller(open_read_maybe_gz(
"tests/data/annotate/strucvars/delly2-min.vcf",
)?)?;
insta::assert_debug_snapshot!(sv_caller);

Ok(())
}

#[test]
fn guess_sv_caller_dragen_sv() -> Result<(), anyhow::Error> {
let sv_caller = guess_sv_caller("tests/data/annotate/strucvars/dragen-sv-min.vcf")?;
let sv_caller = guess_sv_caller(open_read_maybe_gz(
"tests/data/annotate/strucvars/dragen-sv-min.vcf",
)?)?;
insta::assert_debug_snapshot!(sv_caller);

Ok(())
}

#[test]
fn guess_sv_caller_dragen_cnv() -> Result<(), anyhow::Error> {
let sv_caller = guess_sv_caller("tests/data/annotate/strucvars/dragen-cnv-min.vcf")?;
let sv_caller = guess_sv_caller(open_read_maybe_gz(
"tests/data/annotate/strucvars/dragen-cnv-min.vcf",
)?)?;
insta::assert_debug_snapshot!(sv_caller);

Ok(())
}

#[test]
fn guess_sv_caller_gcnv() -> Result<(), anyhow::Error> {
let sv_caller = guess_sv_caller("tests/data/annotate/strucvars/gcnv-min.vcf")?;
let sv_caller = guess_sv_caller(open_read_maybe_gz(
"tests/data/annotate/strucvars/gcnv-min.vcf",
)?)?;
insta::assert_debug_snapshot!(sv_caller);

Ok(())
}

#[test]
fn guess_sv_caller_manta() -> Result<(), anyhow::Error> {
let sv_caller = guess_sv_caller("tests/data/annotate/strucvars/manta-min.vcf")?;
let sv_caller = guess_sv_caller(open_read_maybe_gz(
"tests/data/annotate/strucvars/manta-min.vcf",
)?)?;
insta::assert_debug_snapshot!(sv_caller);

Ok(())
}

#[test]
fn guess_sv_caller_melt() -> Result<(), anyhow::Error> {
let sv_caller = guess_sv_caller("tests/data/annotate/strucvars/melt-min.vcf")?;
let sv_caller = guess_sv_caller(open_read_maybe_gz(
"tests/data/annotate/strucvars/melt-min.vcf",
)?)?;
insta::assert_debug_snapshot!(sv_caller);

Ok(())
}

#[test]
fn guess_sv_caller_popdel() -> Result<(), anyhow::Error> {
let sv_caller = guess_sv_caller("tests/data/annotate/strucvars/popdel-min.vcf")?;
let sv_caller = guess_sv_caller(open_read_maybe_gz(
"tests/data/annotate/strucvars/popdel-min.vcf",
)?)?;
insta::assert_debug_snapshot!(sv_caller);

Ok(())
Expand Down