From 65d12a46d427694a10cd6da6b2b3d35d02baa2b8 Mon Sep 17 00:00:00 2001 From: Manuel Holtgrewe Date: Mon, 17 Apr 2023 09:19:40 +0200 Subject: [PATCH] feat: annotation of structural variants (#46) --- src/annotate/mod.rs | 1 + src/annotate/seqvars/binning.rs | 30 ++++++ src/annotate/seqvars/mod.rs | 84 ++++++--------- src/annotate/strucvars/mod.rs | 183 ++++++++++++++++++++++++++++++++ 4 files changed, 245 insertions(+), 53 deletions(-) create mode 100644 src/annotate/seqvars/binning.rs create mode 100644 src/annotate/strucvars/mod.rs diff --git a/src/annotate/mod.rs b/src/annotate/mod.rs index 26e4df0b..756fb5ea 100644 --- a/src/annotate/mod.rs +++ b/src/annotate/mod.rs @@ -1,3 +1,4 @@ //! Annotation of VCF files. pub mod seqvars; +pub mod strucvars; diff --git a/src/annotate/seqvars/binning.rs b/src/annotate/seqvars/binning.rs new file mode 100644 index 00000000..a8a76c22 --- /dev/null +++ b/src/annotate/seqvars/binning.rs @@ -0,0 +1,30 @@ +//! UCSC binning scheme. + +// Offsets for UCSC binning. +// +// cf. http://genomewiki.ucsc.edu/index.php/Bin_indexing_system +static BIN_OFFSETS: &[i32] = &[512 + 64 + 8 + 1, 64 + 8 + 1, 8 + 1, 1, 0]; + +const BIN_FIRST_SHIFT: i32 = 17; + +const BIN_NEXT_SHIFT: i32 = 3; + +// Compute UCSC bin from 0-based half-open interval. +pub fn bin_from_range(begin: i32, end: i32) -> Result { + let mut begin_bin = begin >> BIN_FIRST_SHIFT; + let mut end_bin = std::cmp::max(begin, end - 1) >> BIN_FIRST_SHIFT; + + for offset in BIN_OFFSETS { + if begin_bin == end_bin { + return Ok(offset + begin_bin); + } + begin_bin >>= BIN_NEXT_SHIFT; + end_bin >>= BIN_NEXT_SHIFT; + } + + anyhow::bail!( + "begin {}, end {} out of range in bin_from_range (max is 512M", + begin, + end + ); +} diff --git a/src/annotate/seqvars/mod.rs b/src/annotate/seqvars/mod.rs index 1f7b33a1..a80c5155 100644 --- a/src/annotate/seqvars/mod.rs +++ b/src/annotate/seqvars/mod.rs @@ -1,6 +1,7 @@ //! Annotation of sequence variants. pub mod ann; +pub mod binning; pub mod csq; pub mod provider; @@ -91,7 +92,7 @@ pub struct Args { pub genome_release: Option, /// Path to the input PED file. #[arg(long)] - pub path_input_ped: String, + pub path_input_ped: Option, /// Path to the input VCF file. #[arg(long)] pub path_input_vcf: String, @@ -116,7 +117,7 @@ pub struct PathOutput { pub path_output_vcf: Option, /// Path to the output TSV file (for import into VarFish). - #[arg(long)] + #[arg(long, requires = "path-input-ped")] pub path_output_tsv: Option, } @@ -674,8 +675,8 @@ impl AnnotatedVcfWriter for VcfWriter { } } -/// Writing of VarFish TSV files. -struct VarFishTsvWriter { +/// Writing of sequence variants to VarFish TSV files. +struct VarFishSeqvarTsvWriter { inner: Box, assembly: Option, pedigree: Option, @@ -683,33 +684,6 @@ struct VarFishTsvWriter { hgnc_map: Option>, } -// Offsets for UCSC binning. -// -// cf. http://genomewiki.ucsc.edu/index.php/Bin_indexing_system -static BIN_OFFSETS: &[i32] = &[512 + 64 + 8 + 1, 64 + 8 + 1, 8 + 1, 1, 0]; -const BIN_FIRST_SHIFT: i32 = 17; -const BIN_NEXT_SHIFT: i32 = 3; - -// Compute UCSC bin from 0-based half-open interval. -pub fn bin_from_range(begin: i32, end: i32) -> Result { - let mut begin_bin = begin >> BIN_FIRST_SHIFT; - let mut end_bin = std::cmp::max(begin, end - 1) >> BIN_FIRST_SHIFT; - - for offset in BIN_OFFSETS { - if begin_bin == end_bin { - return Ok(offset + begin_bin); - } - begin_bin >>= BIN_NEXT_SHIFT; - end_bin >>= BIN_NEXT_SHIFT; - } - - anyhow::bail!( - "begin {}, end {} out of range in bin_from_range (max is 512M", - begin, - end - ); -} - /// Entry with genotype (`gt`), coverage (`dp`), allele depth (`ad`) and /// genotype quality (`gq`). #[derive(Debug, Default)] @@ -778,7 +752,7 @@ impl GenotypeCalls { } } -impl VarFishTsvWriter { +impl VarFishSeqvarTsvWriter { // Create new TSV writer from path. pub fn with_path

(p: P) -> Self where @@ -811,7 +785,7 @@ impl VarFishTsvWriter { &self, assembly: Assembly, record: &VcfRecord, - tsv_record: &mut VarFishTsvRecord, + tsv_record: &mut VarFishSeqvarTsvRecord, ) -> Result { tsv_record.release = match assembly { Assembly::Grch37 | Assembly::Grch37p10 => String::from("GRCh37"), @@ -844,7 +818,8 @@ impl VarFishTsvWriter { tsv_record.start = record.position().into(); tsv_record.end = tsv_record.start + tsv_record.reference.len() - 1; - tsv_record.bin = bin_from_range(tsv_record.start as i32 - 1, tsv_record.end as i32)? as u32; + tsv_record.bin = + binning::bin_from_range(tsv_record.start as i32 - 1, tsv_record.end as i32)? as u32; tsv_record.var_type = if tsv_record.reference.len() == tsv_record.alternative.len() { if tsv_record.reference.len() == 1 { @@ -863,7 +838,7 @@ impl VarFishTsvWriter { fn fill_genotype_and_freqs( &self, record: &VcfRecord, - tsv_record: &mut VarFishTsvRecord, + tsv_record: &mut VarFishSeqvarTsvRecord, ) -> Result<(), anyhow::Error> { // Extract genotype information. let hdr = self @@ -1014,7 +989,7 @@ impl VarFishTsvWriter { fn fill_bg_freqs( &self, record: &VcfRecord, - tsv_record: &mut VarFishTsvRecord, + tsv_record: &mut VarFishSeqvarTsvRecord, ) -> Result<(), anyhow::Error> { // Extract gnomAD frequencies. @@ -1117,7 +1092,7 @@ impl VarFishTsvWriter { fn expand_refseq_ensembl_and_write( &mut self, record: &VcfRecord, - tsv_record: &mut VarFishTsvRecord, + tsv_record: &mut VarFishSeqvarTsvRecord, ) -> Result<(), anyhow::Error> { if let Some(anns) = record .info() @@ -1240,7 +1215,7 @@ impl VarFishTsvWriter { fn fill_clinvar( &self, record: &VcfRecord, - tsv_record: &mut VarFishTsvRecord, + tsv_record: &mut VarFishSeqvarTsvRecord, ) -> Result<(), anyhow::Error> { tsv_record.in_clinvar = record .info() @@ -1261,7 +1236,7 @@ impl VarFishTsvWriter { /// A record, as written out to a VarFish TSV file. #[derive(Debug, Default)] -pub struct VarFishTsvRecord { +pub struct VarFishSeqvarTsvRecord { pub release: String, pub chromosome: String, pub chromosome_no: u32, @@ -1323,7 +1298,7 @@ pub struct VarFishTsvRecord { pub ensembl_exon_dist: Option, } -impl VarFishTsvRecord { +impl VarFishSeqvarTsvRecord { /// Clear the `refseq_*` and `ensembl_*` fields. pub fn clear_refseq_ensembl(&mut self) { self.refseq_gene_id = None; @@ -1438,7 +1413,7 @@ impl VarFishTsvRecord { } /// Implement `AnnotatedVcfWriter` for `VarFishTsvWriter`. -impl AnnotatedVcfWriter for VarFishTsvWriter { +impl AnnotatedVcfWriter for VarFishSeqvarTsvWriter { fn write_header(&mut self, header: &VcfHeader) -> Result<(), anyhow::Error> { self.header = Some(header.clone()); let header = &[ @@ -1497,7 +1472,7 @@ impl AnnotatedVcfWriter for VarFishTsvWriter { } fn write_record(&mut self, record: &VcfRecord) -> Result<(), anyhow::Error> { - let mut tsv_record = VarFishTsvRecord::default(); + let mut tsv_record = VarFishSeqvarTsvRecord::default(); if !self.fill_coords( self.assembly.expect("assembly must have been set"), @@ -1580,11 +1555,6 @@ fn run_with_writer(writer: &mut dyn AnnotatedVcfWriter, args: &Args) -> Result<( let cf_clinvar = db_clinvar.cf_handle("clinvar_seqvars").unwrap(); - // Load the pedigree. - tracing::info!("Loading pedigree..."); - writer.set_pedigree(&PedigreeByName::from_path(&args.path_input_ped)?); - tracing::info!("... done loading pedigree"); - // Open the serialized transcripts. tracing::info!("Opening transcript database"); let tx_db = load_tx_db( @@ -1707,6 +1677,14 @@ pub fn run(_common: &crate::common::Args, args: &Args) -> Result<(), anyhow::Err run_with_writer(&mut writer, args)?; } else { let mut writer = VcfWriter::new(File::create(path_output_vcf).map(BufWriter::new)?); + + // Load the pedigree. + tracing::info!("Loading pedigree..."); + writer.set_pedigree(&PedigreeByName::from_path( + &args.path_input_ped.as_ref().unwrap(), + )?); + tracing::info!("... done loading pedigree"); + run_with_writer(&mut writer, args)?; } } else { @@ -1734,7 +1712,7 @@ pub fn run(_common: &crate::common::Args, args: &Args) -> Result<(), anyhow::Err .path_output_tsv .as_ref() .expect("tsv path must be set; vcf and tsv are mutually exclusive, vcf unset"); - let mut writer = VarFishTsvWriter::with_path(path_output_tsv); + let mut writer = VarFishSeqvarTsvWriter::with_path(path_output_tsv); writer.set_hgnc_map(hgnc_map); run_with_writer(&mut writer, args)?; } @@ -1748,7 +1726,7 @@ mod test { use pretty_assertions::assert_eq; use temp_testdir::TempDir; - use crate::annotate::seqvars::bin_from_range; + use super::binning::bin_from_range; use super::{run, Args, PathOutput}; @@ -1772,9 +1750,9 @@ mod test { }, max_var_count: None, max_fb_tables: 5_000_000, - path_input_ped: String::from( + path_input_ped: Some(String::from( "tests/data/db/create/seqvar_freqs/db-rs1263393206/input.ped", - ), + )), }; run(&args_common, &args)?; @@ -1808,9 +1786,9 @@ mod test { }, max_var_count: None, max_fb_tables: 5_000_000, - path_input_ped: String::from( + path_input_ped: Some(String::from( "tests/data/db/create/seqvar_freqs/db-rs1263393206/input.ped", - ), + )), }; run(&args_common, &args)?; diff --git a/src/annotate/strucvars/mod.rs b/src/annotate/strucvars/mod.rs new file mode 100644 index 00000000..a382dfd4 --- /dev/null +++ b/src/annotate/strucvars/mod.rs @@ -0,0 +1,183 @@ +//! Annotation of structural variant VCF files. + +use std::io::Write; +use std::path::Path; +use std::{fs::File, io::BufWriter}; + +use crate::annotate::seqvars::HgncRecord; +use crate::common::GenomeRelease; +use crate::ped::PedigreeByName; +use clap::{Args as ClapArgs, Parser}; +use flate2::write::GzEncoder; +use flate2::Compression; +use hgvs::static_data::Assembly; +use noodles::bgzf::Writer as BgzfWriter; +use noodles::vcf::{Header as VcfHeader, Record as VcfRecord, Writer as VcfWriter}; +use rustc_hash::FxHashMap; + +use super::seqvars::AnnotatedVcfWriter; + +/// Command line arguments for `annotate strucvars` sub command. +#[derive(Parser, Debug)] +#[command(about = "Annotate structural variant VCF files", long_about = None)] +pub struct Args { + /// Path to the mehari database folder. + #[arg(long)] + pub path_db: String, + + /// Genome release to use, default is to auto-detect. + #[arg(long, value_enum)] + pub genome_release: Option, + /// Path to the input PED file. + #[arg(long)] + pub path_input_ped: Option, + /// Path to the input VCF file. + #[arg(long)] + pub path_input_vcf: String, + #[command(flatten)] + pub output: PathOutput, + + /// For debug purposes, maximal number of variants to annotate. + #[arg(long)] + pub max_var_count: Option, + /// Maximal number of flatbuffers tables, should not need tweaking. However, if you see a + /// "too many tables" error output, increase this value. + #[arg(long, default_value_t = 5_000_000)] + pub max_fb_tables: usize, +} + +/// Command line arguments to enforce either `--path-output-vcf` or `--path-output-tsv`. +#[derive(Debug, ClapArgs)] +#[group(required = true, multiple = false)] +pub struct PathOutput { + /// Path to the output VCF file. + #[arg(long)] + pub path_output_vcf: Option, + + /// Path to the output TSV file (for import into VarFish). + #[arg(long, requires = "path-input-ped")] + pub path_output_tsv: Option, +} + +/// Writing of structural variants to VarFish TSV files. +struct VarFishStrucvarTsvWriter { + inner: Box, + assembly: Option, + pedigree: Option, + header: Option, + hgnc_map: Option>, +} + +/// TODO XXX +#[derive(Debug, Default)] +struct GenotypeInfo { + pub name: String, + pub gt: Option, + // XXX +} + +#[derive(Debug, Default)] +struct GenotypeCalls { + pub entries: Vec, +} + +impl GenotypeCalls { + pub fn for_tsv(&self) -> String { + todo!() + } +} + +/// Implement `AnnotatedVcfWriter` for `VarFishTsvWriter`. +impl AnnotatedVcfWriter for VarFishStrucvarTsvWriter { + fn write_header(&mut self, header: &VcfHeader) -> Result<(), anyhow::Error> { + todo!() + } + + fn write_record(&mut self, record: &VcfRecord) -> Result<(), anyhow::Error> { + todo!() + } +} + +impl VarFishStrucvarTsvWriter { + // Create new TSV writer from path. + pub fn with_path

(p: P) -> Self + where + P: AsRef, + { + Self { + inner: if p.as_ref().extension().unwrap() == "gz" { + Box::new(GzEncoder::new( + File::create(p).unwrap(), + Compression::default(), + )) + } else { + Box::new(File::create(p).unwrap()) + }, + hgnc_map: None, + assembly: None, + pedigree: None, + header: None, + } + } +} + +/// Run the annotation with the given `Write` within the `VcfWriter`. +fn run_with_writer(writer: &mut dyn AnnotatedVcfWriter, args: &Args) -> Result<(), anyhow::Error> { + todo!() +} + +/// Main entry point for `annotate strucvars` sub command. +pub fn run(_common: &crate::common::Args, args: &Args) -> Result<(), anyhow::Error> { + tracing::info!("config = {:#?}", &args); + if let Some(path_output_vcf) = &args.output.path_output_vcf { + if path_output_vcf.ends_with(".vcf.gz") || path_output_vcf.ends_with(".vcf.bgzf") { + let mut writer = VcfWriter::new( + File::create(path_output_vcf) + .map(BufWriter::new) + .map(BgzfWriter::new)?, + ); + run_with_writer(&mut writer, args)?; + } else { + let mut writer = VcfWriter::new(File::create(path_output_vcf).map(BufWriter::new)?); + + // Load the pedigree. + tracing::info!("Loading pedigree..."); + writer.set_pedigree(&PedigreeByName::from_path( + &args.path_input_ped.as_ref().unwrap(), + )?); + tracing::info!("... done loading pedigree"); + + run_with_writer(&mut writer, args)?; + } + } else { + // Load the HGNC xlink map. + let hgnc_map = { + tracing::info!("Loading HGNC map ..."); + let mut result = FxHashMap::default(); + + let tsv_file = File::open(&format!("{}/hgnc.tsv", &args.path_db,))?; + let mut tsv_reader = csv::ReaderBuilder::new() + .comment(Some(b'#')) + .delimiter(b'\t') + .from_reader(tsv_file); + for record in tsv_reader.deserialize() { + let record: HgncRecord = record?; + result.insert(record.hgnc_id.clone(), record); + } + tracing::info!("... done loading HGNC map"); + + result + }; + + let path_output_tsv = args + .output + .path_output_tsv + .as_ref() + .expect("tsv path must be set; vcf and tsv are mutually exclusive, vcf unset"); + let mut writer = VarFishStrucvarTsvWriter::with_path(path_output_tsv); + writer.set_hgnc_map(hgnc_map); + run_with_writer(&mut writer, args)?; + } + + Ok(()) +}