diff --git a/.gitattributes b/.gitattributes index 6f6bd60..187f9f9 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,3 +1,5 @@ src/static_data/**/*.json* filter=lfs diff=lfs merge=lfs -text -tests/data/**.gz -tests/data/data/*.gz filter=lfs diff=lfs merge=lfs -text +tests/data/*.gz filter=lfs diff=lfs merge=lfs -text +tests/data/*/*.gz filter=lfs diff=lfs merge=lfs -text +tests/data/*.fasta filter=lfs diff=lfs merge=lfs -text +tests/data/*/*.fasta filter=lfs diff=lfs merge=lfs -text diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index f929f8f..33d9dea 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -31,6 +31,8 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v2 + with: + lfs: true - name: Install stable toolchain uses: actions-rs/toolchain@v1 @@ -69,12 +71,14 @@ jobs: - name: Checkout repository uses: actions/checkout@v2 with: - lfs: true + lfs: 'true' - name: Import test database. run: | + set -euo pipefail zcat tests/data/data/uta_20210129-subset.pgd.gz \ | psql -v ON_ERROR_STOP=1 -U uta_admin -h 0.0.0.0 -d uta + shell: bash env: PGPASSWORD: uta_admin @@ -94,6 +98,8 @@ jobs: env: TEST_UTA_DATABASE_URL: postgres://uta_admin:uta_admin@0.0.0.0/uta TEST_UTA_DATABASE_SCHEMA: uta_20210129 + TEST_SEQREPO_CACHE_MODE: read + TEST_SEQREPO_CACHE_PATH: tests/data/seqrepo_cache.fasta - name: Codecov uses: codecov/codecov-action@v3 diff --git a/.gitignore b/.gitignore index 891c9e4..1e28cf0 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ *~ .*.sw? +/.vscode diff --git a/Cargo.toml b/Cargo.toml index 3c158aa..4ef66af 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,14 +5,21 @@ edition = "2021" [dependencies] anyhow = "1.0.69" +bio = "1.1.0" chrono = "0.4.23" enum-map = "2.4.2" flate2 = "1.0.25" lazy_static = "1.4.0" linked-hash-map = "0.5.6" +log = "0.4.17" nom = "7.1.3" postgres = { version = "0.19.4", features = ["with-chrono-0_4"] } pretty_assertions = "1.3.0" regex = "1.7.1" +seqrepo = { git = "https://github.com/bihealth/seqrepo-rs.git", branch = "27-whole-fasta-sequences-are-read" } serde = { version = "1.0.152", features = ["derive"] } serde_json = "1.0.93" + +[dev-dependencies] +env_logger = "0.10.0" +test-log = "0.2.11" diff --git a/README.md b/README.md index 0628a0a..79fdf20 100644 --- a/README.md +++ b/README.md @@ -15,5 +15,47 @@ To use the public database: ``` export TEST_UTA_DATABASE_URL=postgres://anonymous:anonymous@uta.biocommons.org:/uta export TEST_UTA_DATABASE_SCHEMA=uta_20210129 -$ cargo test ``` + +Note that [seqrepo-rs](https://github.com/bihealth/seqrepo-rs) is used for access to the genome contig sequence. +It is inconvenient to provide sub sets of sequences in SeqRepo format. +Instead, we use a build-cache/read-cache approach that is also used by `biocommons/hgvs`. + +To build the cache, you will first need a download of the seqrepo [as described in biocommons/biocommons.seqrepo Quickstart](https://github.com/biocommons/biocommons.seqrepo#quick-start). +Then, you configure the running of tests for `hgvs-rs` as follows: + +``` +export TEST_SEQREPO_CACHE_MODE=write +export TEST_SEQREPO_PATH=path/to/seqrepo/instance +export TEST_SEQREPO_CACHE_PATH=tests/data/seqrepo_cache.fasta +``` + +When running the tests with `cargo test`, the cache file will be (re-)written. +If you don't want to regenerate the cache then you can use the following settings. +With these settings, the cache will only be read. + +``` +export TEST_SEQREPO_CACHE_MODE=read +export TEST_SEQREPO_CACHE_PATH=tests/data/seqrepo_cache.fasta +``` + +After either this, you can run the tests. + +``` +cargo test +``` + +## Creating Recuded UTA Databases + +The script `tests/data/data/bootstrap.sh` allows to easily build a reduced set of the UTA database given a list of genes. +The process is as follows: + +1. You edit `bootstrap.sh` to include the HGNC gene symbols of the transcripts that you want to use. +2. You run the bootstrap script. + This will download the given UTA dump and reduce it to the information related to these transcripts. + +``` +$ bootstrap.sh http://dl.biocommons.org/uta uta_20210129 +``` + +The `*.pgd.gz` file is added to the Git repository via `git-lfs` and in CI, this minimal database will be used. diff --git a/src/data/interface.rs b/src/data/interface.rs index 8073702..93c657f 100644 --- a/src/data/interface.rs +++ b/src/data/interface.rs @@ -214,7 +214,9 @@ pub trait Provider { /// # Arguments /// /// * `ac` -- accession - fn get_seq(&self, ac: &str) -> Result; + fn get_seq(&self, ac: &str) -> Result { + self.get_seq_part(ac, None, None) + } /// Return sequence part for the given accession. /// diff --git a/src/data/mod.rs b/src/data/mod.rs index da2d2ba..fcaceea 100644 --- a/src/data/mod.rs +++ b/src/data/mod.rs @@ -1,3 +1,4 @@ ///! Datatypes, interfaces, and data acess. pub mod interface; pub mod uta; +pub mod uta_sr; diff --git a/src/data/uta.rs b/src/data/uta.rs index f6c0a9e..4c66d0c 100644 --- a/src/data/uta.rs +++ b/src/data/uta.rs @@ -21,8 +21,7 @@ pub struct Config { /// URL with the connection string, e.g. /// `"postgresql://anonymous:anonymous@uta.biocommons.org/uta'"`. pub db_url: String, - /// The databaser schema to use, corresponds to the data version, e.g., - /// `uta_20210129`. + /// The databaser schema to use, corresponds to the data version, e.g., `uta_20210129`. pub db_schema: String, } @@ -155,6 +154,10 @@ impl TryFrom for TxMappingOptionsRecord { } } +/// This provider provides information from a UTA Postgres database only. +/// +/// The sequences are also read from the database which implies that no genome contig information +/// is available. Use `uta_sr::Provider` for a variant that is enabled to use a SeqRepo. pub struct Provider { /// Configuration for the access. config: Config, @@ -238,10 +241,6 @@ impl ProviderInterface for Provider { Ok(None) } - fn get_seq(&self, ac: &str) -> Result { - self.get_seq_part(ac, None, None) - } - fn get_seq_part( &self, ac: &str, diff --git a/src/data/uta_sr.rs b/src/data/uta_sr.rs new file mode 100644 index 0000000..c69c128 --- /dev/null +++ b/src/data/uta_sr.rs @@ -0,0 +1,258 @@ +//! Code for enabling UTA access where sequences from a SeqRepo. +//! +//! * https://github.com/biocommons/uta +//! * https://github.com/biocommons/biocommons.seqrepo +//! * https://github.com/bihealth/seqrepo-rs + +use std::path::PathBuf; +use std::rc::Rc; + +use crate::data::uta::{Config as UtaConfig, Provider as UtaProvider}; +use crate::data::{ + interface::GeneInfoRecord, interface::Provider as ProviderInterface, interface::TxExonsRecord, + interface::TxForRegionRecord, interface::TxIdentityInfo, interface::TxInfoRecord, + interface::TxMappingOptionsRecord, interface::TxSimilarityRecord, +}; +use seqrepo::{AliasOrSeqId, Interface as SeqRepoInterface, SeqRepo}; + +/// Configurationf or the `data::uta::Provider`. +#[derive(Debug, PartialEq, Clone)] +pub struct Config { + /// URL with the connection string, e.g. + /// `"postgresql://anonymous:anonymous@uta.biocommons.org/uta'"`. + pub db_url: String, + /// The databaser schema to use, corresponds to the data version, e.g., `uta_20210129`. + pub db_schema: String, + /// Path to the seqrepo directory, e.g., `/usr/local/share/seqrepo/latest`. The last path + /// component is the "instance" name. + pub seqrepo_path: String, +} + +/// This provider provides information from a UTA and a SeqRepo. +/// +/// Transcripts from a UTA Postgres database, sequences comes from a SeqRepo. This makes +/// genome contig information available in contrast to `data::uta::Provider`. +pub struct Provider { + inner: UtaProvider, + seqrepo: Rc, +} + +impl Provider { + /// Create a new provider that uses UTA and SeqRepo information from the given configuration. + /// + /// This uses `seqrepo::SeqRepo` for the sequence repository. You can inject any + /// `seqrepo::Interface` implementation using `Provider::with_seqrepo`. + pub fn new(config: Config) -> Result { + let seqrepo = PathBuf::from(&config.seqrepo_path); + let path = seqrepo + .parent() + .ok_or(anyhow::anyhow!( + "Could not get parent from {}", + &config.seqrepo_path + ))? + .to_str() + .unwrap() + .to_string(); + let instance = seqrepo + .file_name() + .ok_or(anyhow::anyhow!( + "Could not get basename from {}", + &config.seqrepo_path + ))? + .to_str() + .unwrap() + .to_string(); + + Ok(Self { + inner: UtaProvider::with_config(&UtaConfig { + db_url: config.db_url.clone(), + db_schema: config.db_schema, + })?, + seqrepo: Rc::new(SeqRepo::new(path, &instance)?), + }) + } + + /// Create a new provider allowing to inject a seqrepo. + pub fn with_seqrepo( + config: Config, + seqrepo: Rc, + ) -> Result { + Ok(Self { + inner: UtaProvider::with_config(&UtaConfig { + db_url: config.db_url.clone(), + db_schema: config.db_schema, + })?, + seqrepo, + }) + } +} + +impl ProviderInterface for Provider { + fn data_version(&self) -> &str { + self.inner.data_version() + } + + fn schema_version(&self) -> &str { + self.inner.schema_version() + } + + fn get_assembly_map( + &self, + assembly: crate::static_data::Assembly, + ) -> linked_hash_map::LinkedHashMap { + self.inner.get_assembly_map(assembly) + } + + fn get_gene_info(&self, hgnc: &str) -> Result { + self.inner.get_gene_info(hgnc) + } + + fn get_pro_ac_for_tx_ac(&self, tx_ac: &str) -> Result, anyhow::Error> { + self.inner.get_pro_ac_for_tx_ac(tx_ac) + } + + fn get_seq_part( + &self, + ac: &str, + begin: Option, + end: Option, + ) -> Result { + let aos = AliasOrSeqId::Alias { + value: ac.to_owned(), + namespace: None, + }; + self.seqrepo.fetch_sequence_part(&aos, begin, end) + } + + fn get_similar_transcripts( + &self, + tx_ac: &str, + ) -> Result, anyhow::Error> { + self.inner.get_similar_transcripts(tx_ac) + } + + fn get_tx_exons( + &self, + tx_ac: &str, + alt_ac: &str, + alt_aln_method: &str, + ) -> Result, anyhow::Error> { + self.inner.get_tx_exons(tx_ac, alt_ac, alt_aln_method) + } + + fn get_tx_for_gene(&self, gene: &str) -> Result, anyhow::Error> { + self.inner.get_tx_for_gene(gene) + } + + fn get_tx_for_region( + &self, + alt_ac: &str, + alt_aln_method: &str, + start_i: i32, + end_i: i32, + ) -> Result, anyhow::Error> { + self.inner + .get_tx_for_region(alt_ac, alt_aln_method, start_i, end_i) + } + + fn get_tx_identity_info(&self, tx_ac: &str) -> Result { + self.inner.get_tx_identity_info(tx_ac) + } + + fn get_tx_info( + &self, + tx_ac: &str, + alt_ac: &str, + alt_aln_method: &str, + ) -> Result { + self.inner.get_tx_info(tx_ac, alt_ac, alt_aln_method) + } + + fn get_tx_mapping_options( + &self, + tax_ac: &str, + ) -> Result, anyhow::Error> { + self.inner.get_tx_mapping_options(tax_ac) + } +} + +/// Code for helping setup of UTA providers, e.g., for setting up caching of SeqRepo results. +pub mod test_helpers { + use std::{path::PathBuf, rc::Rc}; + + use seqrepo::{ + CacheReadingSeqRepo, CacheWritingSeqRepo, Interface as SeqRepoInterface, SeqRepo, + }; + + use super::{Config, Provider, ProviderInterface}; + + /// Setup a UTA Provider with data source depending on environment variables. + /// + /// See README.md for information on environment variable setup. + pub fn build_provider() -> Result, anyhow::Error> { + log::debug!("building provider..."); + let db_url = std::env::var("TEST_UTA_DATABASE_URL") + .expect("Environment variable TEST_UTA_DATABASE_URL undefined!"); + let db_schema = std::env::var("TEST_UTA_DATABASE_SCHEMA") + .expect("Environment variable TEST_UTA_DATABASE_SCHEMA undefined!"); + let sr_cache_mode = std::env::var("TEST_SEQREPO_CACHE_MODE") + .expect("Environment variable TEST_SEQREPO_CACHE_MODE undefined!"); + let sr_cache_path = std::env::var("TEST_SEQREPO_CACHE_PATH") + .expect("Environment variable TEST_SEQREPO_CACHE_PATH undefined!"); + + let (seqrepo, seqrepo_path) = if sr_cache_mode == "read" { + log::debug!("reading provider..."); + let seqrepo: Rc = + Rc::new(CacheReadingSeqRepo::new(sr_cache_path)?); + log::debug!("construction done..."); + (seqrepo, "".to_string()) + } else if sr_cache_mode == "write" { + log::debug!("writing provider..."); + build_writing_sr(sr_cache_path)? + } else { + panic!("Invalid cache mode {}", &sr_cache_mode); + }; + log::debug!("now returning provider..."); + + Ok(Rc::new(Provider::with_seqrepo( + Config { + db_url, + db_schema, + seqrepo_path, + }, + seqrepo, + )?)) + } + + /// Helper that builds the cache writing SeqRepo with inner stock SeqRepo. + fn build_writing_sr( + sr_cache_path: String, + ) -> Result<(Rc, String), anyhow::Error> { + let seqrepo_path = std::env::var("TEST_SEQREPO_PATH") + .expect("Environment variable TEST_SEQREPO_PATH undefined!"); + let path_buf = PathBuf::from(seqrepo_path.clone()); + let path = path_buf + .parent() + .ok_or(anyhow::anyhow!( + "Could not get parent from {}", + &seqrepo_path + ))? + .to_str() + .unwrap() + .to_string(); + let instance = path_buf + .file_name() + .ok_or(anyhow::anyhow!( + "Could not get basename from {}", + &seqrepo_path + ))? + .to_str() + .unwrap() + .to_string(); + let seqrepo: Rc = Rc::new(CacheWritingSeqRepo::new( + SeqRepo::new(path, &instance)?, + sr_cache_path, + )?); + Ok((seqrepo, seqrepo_path)) + } +} diff --git a/src/lib.rs b/src/lib.rs index 917d65c..a81b443 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,20 +1,7 @@ pub mod data; pub mod mapper; +pub mod normalizer; pub mod parser; pub mod static_data; +pub(crate) mod utils; pub mod validator; - -pub fn add(left: usize, right: usize) -> usize { - left + right -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn it_works() { - let result = add(2, 2); - assert_eq!(result, 4); - } -} diff --git a/src/mapper/alignment.rs b/src/mapper/alignment.rs index 23cc4a4..77ab15e 100644 --- a/src/mapper/alignment.rs +++ b/src/mapper/alignment.rs @@ -118,7 +118,7 @@ impl Default for Config { /// Map HGVS location objects between genomic (g), non-coding (n) and cds (c) /// coordinates according to a CIGAR string. -pub struct AlignmentMapper { +pub struct Mapper { /// Configuration for alignment mapping. pub config: Config, /// Data provider to use for the mapping. @@ -138,13 +138,13 @@ pub struct AlignmentMapper { pub cigar_mapper: CigarMapper, } -impl AlignmentMapper { +impl Mapper { pub fn new( provider: Rc, tx_ac: &str, alt_ac: &str, alt_aln_method: &str, - ) -> Result { + ) -> Result { let (strand, gc_offset, cds_start_i, cds_end_i, tgt_len, cigar_mapper) = if alt_aln_method != "transcript" { let tx_info = provider.get_tx_info(tx_ac, alt_ac, alt_aln_method)?; @@ -230,7 +230,7 @@ impl AlignmentMapper { )); } - Ok(AlignmentMapper { + Ok(Mapper { config: Default::default(), provider, tx_ac: tx_ac.to_string(), @@ -246,11 +246,7 @@ impl AlignmentMapper { } /// Convert a genomic (g.) interval to a transcript (n.) interval. - pub fn g_to_n( - &self, - g_interval: &GenomeInterval, - _strict_bounds: bool, - ) -> Result, anyhow::Error> { + pub fn g_to_n(&self, g_interval: &GenomeInterval) -> Result, anyhow::Error> { if let GenomeInterval { start: Some(begin), end: Some(end), @@ -317,11 +313,7 @@ impl AlignmentMapper { } /// Convert a transcript (n.) interval to a genomic (g.) interval. - pub fn n_to_g( - &self, - n_interval: &TxInterval, - strict_bounds: bool, - ) -> Result, anyhow::Error> { + pub fn n_to_g(&self, n_interval: &TxInterval) -> Result, anyhow::Error> { let frs = hgvs_to_zbc(n_interval.start.base); let start_offset = n_interval.start.offset.unwrap_or(0); let fre = hgvs_to_zbc(n_interval.end.base); @@ -341,10 +333,10 @@ impl AlignmentMapper { // Obtain the genomic range start (grs) and end (gre). let grs = self .cigar_mapper - .map_tgt_to_ref(frs, "start", strict_bounds)?; + .map_tgt_to_ref(frs, "start", self.config.strict_bounds)?; let gre = self .cigar_mapper - .map_tgt_to_ref(fre, "start", strict_bounds)?; + .map_tgt_to_ref(fre, "start", self.config.strict_bounds)?; let (grs_pos, gre_pos) = (grs.pos + self.gc_offset + 1, gre.pos + self.gc_offset + 1); let (gs, ge) = (grs_pos + start_offset, gre_pos + end_offset); @@ -386,11 +378,7 @@ impl AlignmentMapper { } /// Convert a transcript (n.) interval to a CDS (c.) interval. - pub fn n_to_c( - &self, - n_interval: &TxInterval, - strict_bounds: bool, - ) -> Result { + pub fn n_to_c(&self, n_interval: &TxInterval) -> Result { if self.cds_start_i.is_none() { return Err(anyhow::anyhow!( "CDS is undefined for {}; cannot map to c. coordinate (non-coding transcript?)", @@ -398,7 +386,9 @@ impl AlignmentMapper { )); } - if strict_bounds && (n_interval.start.base <= 0 || n_interval.end.base > self.tgt_len) { + if self.config.strict_bounds + && (n_interval.start.base <= 0 || n_interval.end.base > self.tgt_len) + { return Err(anyhow::anyhow!( "The given coordinate is outside the bounds of the reference sequence." )); @@ -410,7 +400,7 @@ impl AlignmentMapper { }) } - pub fn pos_c_to_n(&self, pos: &CdsPos, strict_bounds: bool) -> Result { + pub fn pos_c_to_n(&self, pos: &CdsPos) -> Result { let cds_start_i = self.cds_start_i.unwrap(); let cds_end_i = self.cds_end_i.unwrap(); @@ -434,7 +424,7 @@ impl AlignmentMapper { n }; - if (n <= 0 || n > self.tgt_len) && strict_bounds { + if (n <= 0 || n > self.tgt_len) && self.config.strict_bounds { Err(anyhow::anyhow!("c.{:?} coordinate is out of boounds", pos)) } else { Ok(TxPos { @@ -445,11 +435,7 @@ impl AlignmentMapper { } /// Convert a a CDS (c.) interval to a transcript (n.) interval. - pub fn c_to_n( - &self, - c_interval: &CdsInterval, - strict_bounds: bool, - ) -> Result { + pub fn c_to_n(&self, c_interval: &CdsInterval) -> Result { if self.cds_start_i.is_none() { return Err(anyhow::anyhow!( "CDS is undefined for {}; cannot map to c. coordinate (non-coding transcript?)", @@ -457,35 +443,29 @@ impl AlignmentMapper { )); } - let n_start = self.pos_c_to_n(&c_interval.start, strict_bounds); - let n_end = self.pos_c_to_n(&c_interval.end, strict_bounds); - Ok(TxInterval { + let n_start = self.pos_c_to_n(&c_interval.start); + let n_end = self.pos_c_to_n(&c_interval.end); + let result = TxInterval { start: n_start?, end: n_end?, - }) + }; + log::debug!("c_to_n({}) = {}", c_interval, &result); + Ok(result) } /// Convert a genomic (g.) interval to a CDS (c.) interval. - pub fn g_to_c( - &self, - g_interval: &GenomeInterval, - strict_bounds: bool, - ) -> Result, anyhow::Error> { - let n_interval = self.g_to_n(g_interval, strict_bounds)?; + pub fn g_to_c(&self, g_interval: &GenomeInterval) -> Result, anyhow::Error> { + let n_interval = self.g_to_n(g_interval)?; Ok(match &n_interval { - Mu::Certain(n_interval) => Mu::Certain(self.n_to_c(n_interval, strict_bounds)?), - Mu::Uncertain(n_interval) => Mu::Uncertain(self.n_to_c(n_interval, strict_bounds)?), + Mu::Certain(n_interval) => Mu::Certain(self.n_to_c(n_interval)?), + Mu::Uncertain(n_interval) => Mu::Uncertain(self.n_to_c(n_interval)?), }) } /// Convert a CDS (c.) interval to a genomic (g.) interval. - pub fn c_to_g( - &self, - c_interval: &CdsInterval, - strict_bounds: bool, - ) -> Result, anyhow::Error> { - let n_interval = self.c_to_n(c_interval, strict_bounds)?; - let g_interval = self.n_to_g(&n_interval, strict_bounds)?; + pub fn c_to_g(&self, c_interval: &CdsInterval) -> Result, anyhow::Error> { + let n_interval = self.c_to_n(c_interval)?; + let g_interval = self.n_to_g(&n_interval)?; Ok(g_interval) } @@ -495,7 +475,7 @@ impl AlignmentMapper { } /// Return whether the given genome interval is in bounds. - pub fn is_g_interval_in_bounds(&self, g_interval: GenomeInterval) -> bool { + pub fn is_g_interval_in_bounds(&self, g_interval: &GenomeInterval) -> bool { let grs = g_interval.start.unwrap() - 1 - self.gc_offset; let gre = g_interval.end.unwrap() - 1 - self.gc_offset; grs >= 0 && gre <= self.cigar_mapper.ref_len @@ -516,7 +496,7 @@ mod test { parser::{CdsFrom, CdsInterval, CdsPos, GenomeInterval, Mu, TxInterval, TxPos}, }; - use super::{build_tx_cigar, none_if_default, AlignmentMapper}; + use super::{build_tx_cigar, none_if_default, Mapper}; #[test] fn build_tx_cigar_empty() { @@ -610,55 +590,44 @@ mod test { // unknown sequences - assert!(AlignmentMapper::new(provider.clone(), "bogus", "NM_033089.6", "splign").is_err()); - assert!( - AlignmentMapper::new(provider.clone(), "bogus", "NM_033089.6", "transcript").is_err() - ); - assert!(AlignmentMapper::new(provider.clone(), "NM_033089.6", "bogus", "splign").is_err()); - assert!( - AlignmentMapper::new(provider.clone(), "NM_000051.3", "NC_000011.9", "bogus").is_err() - ); + assert!(Mapper::new(provider.clone(), "bogus", "NM_033089.6", "splign").is_err()); + assert!(Mapper::new(provider.clone(), "bogus", "NM_033089.6", "transcript").is_err()); + assert!(Mapper::new(provider.clone(), "NM_033089.6", "bogus", "splign").is_err()); + assert!(Mapper::new(provider.clone(), "NM_000051.3", "NC_000011.9", "bogus").is_err()); // invalid intervals { - let am = - AlignmentMapper::new(provider.clone(), "NM_000348.3", "NC_000002.11", "splign")?; + let am = Mapper::new(provider.clone(), "NM_000348.3", "NC_000002.11", "splign")?; assert!(am - .n_to_c( - &TxInterval { - start: TxPos { - base: -1, - offset: None - }, - end: TxPos { - base: -1, - offset: None - }, + .n_to_c(&TxInterval { + start: TxPos { + base: -1, + offset: None }, - true - ) + end: TxPos { + base: -1, + offset: None + }, + },) .is_err()); } { - let am = AlignmentMapper::new(provider, "NM_000348.3", "NC_000002.11", "splign")?; + let am = Mapper::new(provider, "NM_000348.3", "NC_000002.11", "splign")?; assert!(am - .c_to_n( - &CdsInterval { - start: CdsPos { - base: 99999, - offset: None, - cds_from: CdsFrom::Start - }, - end: CdsPos { - base: 99999, - offset: None, - cds_from: CdsFrom::Start - }, + .c_to_n(&CdsInterval { + start: CdsPos { + base: 99999, + offset: None, + cds_from: CdsFrom::Start }, - true - ) + end: CdsPos { + base: 99999, + offset: None, + cds_from: CdsFrom::Start + }, + },) .is_err()); } @@ -673,12 +642,12 @@ mod test { ) -> Result<(), anyhow::Error> { let config = get_config(); let provider = Rc::new(Provider::with_config(&config)?); - let mapper = AlignmentMapper::new(provider, tx_ac, alt_ac, "splign")?; + let mapper = Mapper::new(provider, tx_ac, alt_ac, "splign")?; for (g_interval, n_interval, c_interval) in cases { assert_eq!( c_interval, - mapper.g_to_c(g_interval, true)?.inner(), + mapper.g_to_c(g_interval)?.inner(), "{}~{} {} mapper.g_to_c", tx_ac, alt_ac, @@ -686,7 +655,7 @@ mod test { ); assert_eq!( c_interval, - &mapper.n_to_c(n_interval, true)?, + &mapper.n_to_c(n_interval)?, "{}~{} {} mapper.n_to_c", tx_ac, alt_ac, @@ -695,7 +664,7 @@ mod test { assert_eq!( g_interval, - mapper.c_to_g(c_interval, true)?.inner(), + mapper.c_to_g(c_interval)?.inner(), "{}~{} {} mapper.c_to_g", tx_ac, alt_ac, @@ -703,7 +672,7 @@ mod test { ); assert_eq!( Mu::Certain(g_interval.clone()), - mapper.n_to_g(n_interval, true)?, + mapper.n_to_g(n_interval)?, "{}~{} {} mapper.n_to_g", tx_ac, alt_ac, @@ -712,7 +681,7 @@ mod test { assert_eq!( n_interval, - &mapper.c_to_n(c_interval, true)?, + &mapper.c_to_n(c_interval)?, "{}~{} {} mapper.c_to_n", tx_ac, alt_ac, @@ -720,7 +689,7 @@ mod test { ); assert_eq!( Mu::Certain(n_interval.clone()), - mapper.g_to_n(g_interval, true)?, + mapper.g_to_n(g_interval)?, "{}~{} {} mapper.g_to_n", tx_ac, alt_ac, diff --git a/src/mapper/variant.rs b/src/mapper/variant.rs index cb1bc35..80f8e9b 100644 --- a/src/mapper/variant.rs +++ b/src/mapper/variant.rs @@ -1,12 +1,71 @@ -//! Code for variant mapping. +//! Code for mapping variants between sequences. -use std::rc::Rc; +use std::{ops::Range, rc::Rc}; +use log::{debug, info}; + +use super::alignment::Mapper as AlignmentMapper; use crate::{ data::interface::Provider, - parser::HgvsVariant, + normalizer::{self, Normalizer}, + parser::{ + Accession, CdsInterval, CdsLocEdit, CdsPos, GeneSymbol, GenomeInterval, GenomeLocEdit, + HgvsVariant, Mu, NaEdit, TxInterval, TxLocEdit, TxPos, + }, + utils::revcomp, validator::{ValidationLevel, Validator}, }; +use std::ops::Deref; + +/// Implementation details of `Mapper::c_to_p` +mod c_to_p_impl { + use std::rc::Rc; + + use crate::{data::interface::Provider, parser::HgvsVariant}; + + pub struct RefTranscriptData { + /// Data provider. + pub provider: Rc, + /// Transcript accession. + pub tx_ac: String, + /// Protein accession. + pub prot_ac: Option, + } + + pub struct AltSeqBuilder<'a> { + pub reference_data: &'a RefTranscriptData, + } + + pub struct AltData {} + + impl<'a> AltSeqBuilder<'a> { + pub fn new(reference_data: &'a RefTranscriptData) -> Self { + Self { reference_data } + } + + pub fn build_altseq(&self) -> Result, anyhow::Error> { + todo!() + } + } + + pub struct AltSeqToHgvsp<'a> { + pub var_c: &'a HgvsVariant, + pub reference_data: &'a RefTranscriptData, + } + + impl<'a> AltSeqToHgvsp<'a> { + pub fn new(var_c: &'a HgvsVariant, reference_data: &'a RefTranscriptData) -> Self { + Self { + var_c, + reference_data, + } + } + + pub fn build_hgvsp(&self) -> Result { + todo!() + } + } +} /// Configuration for Mapper. /// @@ -17,6 +76,7 @@ pub struct Config { pub strict_validation: bool, pub prevalidation_level: ValidationLevel, pub add_gene_symbol: bool, + pub strict_bounds: bool, } impl Default for Config { @@ -24,57 +84,402 @@ impl Default for Config { Self { replace_reference: true, strict_validation: false, - prevalidation_level: ValidationLevel::Extrinsic, + prevalidation_level: ValidationLevel::Full, add_gene_symbol: false, + strict_bounds: true, } } } +/// Projects variants between sequences using `alignment::Mapper`. pub struct Mapper { config: Config, provider: Rc, - validator: Box, + validator: Rc, } +/// Maps SequenceVariant objects between g., n., r., c., and p. representations. +/// +/// g⟷{c,n,r} projections are similar in that c, n, and r variants +/// may use intronic coordinates. There are two essential differences +/// that distinguish the three types: +/// +/// * Sequence start: In n and r variants, position 1 is the sequence +/// start; in c variants, 1 is the transcription start site. +/// * Alphabet: In n and c variants, sequences are DNA; in +/// r. variants, sequences are RNA. +/// +/// This differences are summarized in this diagram:: +/// +/// ```text +/// g ----acgtatgcac--gtctagacgt---- ----acgtatgcac--gtctagacgt---- ----acgtatgcac--gtctagacgt---- +/// \ \/ / \ \/ / \ \/ / +/// c acgtATGCACGTCTAGacgt n acgtatgcacgtctagacgt r acguaugcacgucuagacgu +/// 1 1 1 +/// p MetHisValTer +/// +/// The g excerpt and exon structures are identical. The g⟷n +/// transformation, which is the most basic, accounts for the offset +/// of the aligned sequences (shown with "1") and the exon structure. +/// The g⟷c transformation is akin to g⟷n transformation, but +/// requires an addition offset to account for the translation start +/// site (c.1). The CDS in uppercase. The g⟷c transformation is +/// akin to g⟷n transformation with a change of alphabet. +/// +/// Therefore, this this code uses g⟷n as the core transformation +/// between genomic and c, n, and r variants: All c⟷g and r⟷g +/// transformations use n⟷g after accounting for the above +/// differences. For example, c_to_g accounts for the transcription +/// start site offset, then calls n_to_g. impl Mapper { - fn new(config: &Config, provider: Rc) -> Mapper { + pub fn new(config: &Config, provider: Rc) -> Mapper { let validator = config .prevalidation_level .validator(config.strict_validation, provider.clone()); + + let _n_config = normalizer::Config::default(); + Mapper { config: config.clone(), - provider, - validator, + provider: provider.clone(), + validator: validator.clone(), } } - fn config(&self) -> &Config { + pub fn config(&self) -> &Config { &self.config } - /// Convert from genome to transcription. + /// Return a copy of the internal provider. + pub fn provider(&self) -> Rc { + self.provider.clone() + } + + /// Obtain new `alignment::Mapper` for the given arguments, possibly caching results. + fn build_alignment_mapper( + &self, + tx_ac: &str, + alt_ac: &str, + alt_aln_method: &str, + ) -> Result { + // TODO: implement caching + AlignmentMapper::new(self.provider.clone(), tx_ac, alt_ac, alt_aln_method) + } + + /// Convert from genome (g.) variant to transcript variant (g. or n.). /// /// # Args /// /// * `var_g` -- `HgvsVariant::GenomeVariant` to project /// * `tx_ac` -- accession of transcript to project to /// * `alt_al_method` -- alignment method, e.g., `splign` - fn g_to_t( + pub fn g_to_t( &self, - var_g: HgvsVariant, - _tx_ac: &str, - _alt_aln_method: &str, - ) -> Result<(), anyhow::Error> { + var_t: &HgvsVariant, + alt_ac: &str, + alt_aln_method: &str, + ) -> Result { + self.validator.validate(var_t)?; + let var_t = self.replace_reference(var_t.clone())?; + match var_t { + HgvsVariant::TxVariant { .. } => self.n_to_g(&var_t, alt_ac, alt_aln_method), + HgvsVariant::CdsVariant { .. } => self.c_to_g(&var_t, alt_ac, alt_aln_method), + _ => { + return Err(anyhow::anyhow!( + "Expected transcript or CDS variant but received {}", + &var_t + )) + } + } + } + + /// Convert from genome (g.) variant to transcript variant (n.). + /// + /// # Args + /// + /// * `var_g` -- `HgvsVariant::GenomeVariant` to project + /// * `tx_ac` -- accession of transcript to project to + /// * `alt_al_method` -- alignment method, e.g., `"splign"` + pub fn g_to_n( + &self, + var_g: &HgvsVariant, + tx_ac: &str, + alt_aln_method: &str, + ) -> Result { + self.validator.validate(var_g)?; + let var_g = self.replace_reference(var_g.clone())?; if let HgvsVariant::GenomeVariant { - accession: _, + accession, + loc_edit, + gene_symbol, + } = &var_g + { + let mapper = self.build_alignment_mapper(tx_ac, &accession.value, alt_aln_method)?; + + let var_g = if mapper.strand == -1 + && !self.config.strict_bounds + && !mapper.is_g_interval_in_bounds(loc_edit.loc.inner()) + { + info!("Renormalizing out-of-bounds minus strand variant on genomic sequenc"); + Normalizer::new( + self, + self.provider.clone(), + self.validator.clone(), + Default::default(), + ) + .normalize(&var_g)? + } else { + var_g.clone() + }; + + let pos_n = mapper.g_to_n(loc_edit.loc.inner())?; + let pos_n = Mu::from( + pos_n.inner(), + loc_edit.loc.is_certain() && pos_n.is_certain(), + ); + let (pos_n, edit_n) = if let Mu::Certain(pos_n) = pos_n { + let edit_n = self.convert_edit_check_strand(mapper.strand, &loc_edit.edit)?; + if let NaEdit::Ins { .. } = edit_n.inner() { + if pos_n.start.offset.is_none() + && pos_n.end.offset.is_none() + && pos_n.end.base - pos_n.start.base > 1 + { + ( + Mu::Certain(TxInterval { + start: TxPos { + base: pos_n.start.base + 1, + ..pos_n.start + }, + end: TxPos { + base: pos_n.end.base - 1, + ..pos_n.end + }, + }), + Mu::Certain(NaEdit::Ins { + alternative: "".to_string(), + }), + ) + } else { + (Mu::Certain((*pos_n).clone()), edit_n) + } + } else { + (Mu::Certain((*pos_n).clone()), edit_n) + } + } else { + let pos_g = mapper.n_to_g(pos_n.inner())?; + let edit_n = NaEdit::RefAlt { + reference: "".to_string(), + alternative: self.get_altered_sequence( + mapper.strand, + pos_g.inner().clone().try_into()?, + &var_g, + )?, + }; + (Mu::Uncertain((*pos_n.inner()).clone()), Mu::Certain(edit_n)) + }; + + // the following is not needed? + // pos_n.uncertain = var_g.posedit.pos.uncertain + + let var_n = HgvsVariant::TxVariant { + accession: Accession::new(tx_ac), + gene_symbol: self.fetch_gene_symbol(tx_ac, gene_symbol)?, + loc_edit: TxLocEdit { + loc: pos_n.clone(), + edit: edit_n, + }, + }; + + let var_n = if self.config.replace_reference + && pos_n.inner().start.base >= 0 + && pos_n.inner().end.base < mapper.tgt_len + { + self.replace_reference(var_n)? + } else { + var_n + }; + + Ok(var_n) + } else { + Err(anyhow::anyhow!( + "Expected a GenomeVariant but received {}", + &var_g + )) + } + } + + /// Convert from transcript variant (g.) variant to genome variant (n.). + /// + /// # Args + /// + /// * `var_n` -- `HgvsVariant::TxVariant` to project + /// * `alt_ac` -- alternative contig accession + /// * `alt_al_method` -- alignment method, e.g., `"splign"` + pub fn n_to_g( + &self, + var_n: &HgvsVariant, + alt_ac: &str, + alt_aln_method: &str, + ) -> Result { + self.validator.validate(var_n)?; + let var_n = self.replace_reference(var_n.clone())?; + if let HgvsVariant::TxVariant { + accession, gene_symbol: _, - loc_edit: _, + loc_edit, + } = &var_n + { + let mapper = self.build_alignment_mapper(&accession.value, alt_ac, alt_aln_method)?; + let pos_g = mapper.n_to_g(loc_edit.loc.inner())?; + + let (pos_g, edit_g) = if let Mu::Certain(pos_g) = pos_g { + let edit_g = self.convert_edit_check_strand(mapper.strand, &loc_edit.edit)?; + if let (NaEdit::Ins { .. }, Some(end), Some(start)) = + (edit_g.inner(), pos_g.end, pos_g.start) + { + if end - start > 1 { + ( + Mu::Certain(GenomeInterval { + start: Some(start + 1), + end: Some(end - 1), + }), + Mu::from( + NaEdit::Ins { + alternative: "".to_string(), + }, + edit_g.is_certain(), + ), + ) + } else { + (Mu::Certain(pos_g), edit_g) + } + } else { + (Mu::Certain(pos_g), edit_g) + } + } else { + // variant at alignment gap + let pos_n = mapper.g_to_n(pos_g.inner())?; + let edit_g = NaEdit::RefAlt { + reference: "".to_string(), + alternative: self.get_altered_sequence( + mapper.strand, + pos_n.inner().clone().into(), + &var_n, + )?, + }; + (pos_g, Mu::Certain(edit_g)) + }; + + // the following is not needed? + // pos_g.uncertain = var_n.posedit.pos.uncertain + + let var_g = HgvsVariant::GenomeVariant { + accession: Accession::new(alt_ac), + gene_symbol: None, + loc_edit: GenomeLocEdit { + loc: pos_g, + edit: edit_g, + }, + }; + + let var_g = if self.config.replace_reference { + self.replace_reference(var_g)? + } else { + var_g + }; + + // No gene symbol for g. variants (actually, *should* for NG, but no way to distinguish) + + Ok(var_g) + } else { + Err(anyhow::anyhow!( + "Expected a TxVariant but received {}", + &var_n + )) + } + } + + /// Convert from genome (g.) variant to CDS variant (c.). + /// + /// # Args + /// + /// * `var_g` -- `HgvsVariant::GenomeVariant` to project + /// * `tx_ac` -- accession of transcript to project to + /// * `alt_al_method` -- alignment method, e.g., `"splign"` + pub fn g_to_c( + &self, + var_g: &HgvsVariant, + tx_ac: &str, + alt_aln_method: &str, + ) -> Result { + self.validator.validate(var_g)?; + let var_g = self.replace_reference(var_g.clone())?; + if let HgvsVariant::GenomeVariant { + accession, + gene_symbol, + loc_edit, } = &var_g { - self.validator.validate(&var_g)?; - let _var_g = var_g.fill_ref(self.provider.as_ref())?; + let mapper = self.build_alignment_mapper(tx_ac, &accession.value, alt_aln_method)?; + let pos_c = mapper.g_to_c(loc_edit.loc.inner())?; + + let (pos_c, edit_c) = if let Mu::Certain(pos_c) = pos_c { + let edit_c = self.convert_edit_check_strand(mapper.strand, &loc_edit.edit)?; + if let NaEdit::Ins { .. } = edit_c.inner() { + if pos_c.start.offset.is_none() + && pos_c.end.offset.is_none() + && pos_c.end.base - pos_c.start.base > 1 + { + ( + Mu::Certain(CdsInterval { + start: CdsPos { + base: pos_c.start.base + 1, + ..pos_c.start + }, + end: CdsPos { + base: pos_c.end.base - 1, + ..pos_c.end + }, + }), + Mu::Certain(NaEdit::Ins { + alternative: "".to_string(), + }), + ) + } else { + (Mu::Certain(pos_c), edit_c) + } + } else { + (Mu::Certain(pos_c), edit_c) + } + } else { + let pos_g = mapper.c_to_g(pos_c.inner())?; + let edit_c = NaEdit::RefAlt { + reference: "".to_string(), + alternative: self.get_altered_sequence( + mapper.strand, + pos_g.inner().clone().try_into()?, + &var_g, + )?, + }; + (Mu::Uncertain((*pos_c.inner()).clone()), Mu::Certain(edit_c)) + }; - Ok(()) + let var_c = HgvsVariant::CdsVariant { + accession: Accession::new(tx_ac), + gene_symbol: self.fetch_gene_symbol(tx_ac, gene_symbol)?, + loc_edit: CdsLocEdit { + loc: pos_c, + edit: edit_c, + }, + }; + + let var_c = if self.config.replace_reference { + self.replace_reference(var_c)? + } else { + var_c + }; + + Ok(var_c) } else { Err(anyhow::anyhow!( "Expected a GenomeVariant but received {}", @@ -82,4 +487,611 @@ impl Mapper { )) } } + + /// Convert from CDS variant (c.) to genome variant (g.). + /// + /// # Args + /// + /// * `var_c` -- `HgvsVariant::CdsVariant` to project + /// * `alt_ac` -- alternative contig accession + /// * `alt_al_method` -- alignment method, e.g., `"splign"` + pub fn c_to_g( + &self, + var_c: &HgvsVariant, + alt_ac: &str, + alt_aln_method: &str, + ) -> Result { + self.validator.validate(var_c)?; + let var_c = self.replace_reference(var_c.clone())?; + if let HgvsVariant::CdsVariant { + accession, + gene_symbol, + loc_edit, + } = &var_c + { + let mapper = self.build_alignment_mapper(&accession.value, alt_ac, alt_aln_method)?; + let pos_g = mapper.c_to_g(loc_edit.loc.inner())?; + + let (pos_g, edit_g) = if let Mu::Certain(pos_g) = pos_g { + let edit_g = self.convert_edit_check_strand(mapper.strand, &loc_edit.edit)?; + if let (NaEdit::Ins { .. }, Some(end), Some(start)) = + (edit_g.inner(), pos_g.end, pos_g.start) + { + if end - start > 1 { + ( + Mu::Certain(GenomeInterval { + start: Some(start + 1), + end: Some(end - 1), + }), + Mu::from( + NaEdit::Ins { + alternative: "".to_string(), + }, + edit_g.is_certain(), + ), + ) + } else { + (Mu::Certain(pos_g), edit_g) + } + } else { + (Mu::Certain(pos_g), edit_g) + } + } else { + // variant at alignment gap + let pos_n = mapper.g_to_n(pos_g.inner())?; + let edit_n = NaEdit::RefAlt { + reference: "".to_string(), + alternative: self.get_altered_sequence( + mapper.strand, + pos_n.inner().clone().into(), + &var_c, + )?, + }; + (pos_g, Mu::Certain(edit_n)) + }; + + let var_g = HgvsVariant::GenomeVariant { + accession: accession.clone(), + gene_symbol: self.fetch_gene_symbol(accession.deref().as_str(), gene_symbol)?, + loc_edit: GenomeLocEdit { + loc: pos_g, + edit: edit_g, + }, + }; + + let var_g = if self.config.replace_reference { + self.replace_reference(var_g)? + } else { + var_g + }; + + Ok(var_g) + } else { + Err(anyhow::anyhow!( + "Expected a CdsVariant but received {}", + &var_c + )) + } + } + + /// Convert from transcript (c. or n.) to genome (g.) variant. + /// + /// # Args + /// + /// * `var_t` -- `HgvsVariant::TxVariant` or `HgvsVariant::CdsVariant` to project + /// * `alt_ac` -- accession of alternativ esequence + /// * `alt_al_method` -- alignment method, e.g., `"splign"` + pub fn t_to_g( + &self, + var_t: &HgvsVariant, + alt_ac: &str, + alt_aln_method: &str, + ) -> Result { + self.validator.validate(var_t)?; + let var_t = self.replace_reference(var_t.clone())?; + match var_t { + HgvsVariant::TxVariant { .. } => self.n_to_g(&var_t, alt_ac, alt_aln_method), + HgvsVariant::CdsVariant { .. } => self.c_to_g(&var_t, alt_ac, alt_aln_method), + _ => { + return Err(anyhow::anyhow!( + "Expected transcript or CDS variant but received {}", + &var_t + )) + } + } + } + + /// Convert from CDS variant (c.) to transcript variant (n.). + /// + /// # Args + /// + /// * `var_c` -- `HgvsVariant::CdsVariant` to project + pub fn c_to_n(&self, var_c: &HgvsVariant) -> Result { + log::debug!("c_to_n({})", var_c); + self.validator.validate(var_c)?; + let var_c = self.replace_reference(var_c.clone())?; + if let HgvsVariant::CdsVariant { + accession, + gene_symbol, + loc_edit, + } = &var_c + { + let mapper = + self.build_alignment_mapper(&accession.value, &accession.value, "transcript")?; + + let pos_n = mapper.c_to_n(loc_edit.loc.inner())?; + let var_n = HgvsVariant::TxVariant { + accession: accession.clone(), + gene_symbol: self.fetch_gene_symbol(accession.deref().as_str(), gene_symbol)?, + loc_edit: TxLocEdit { + loc: Mu::from(pos_n, loc_edit.loc.is_certain()), + edit: loc_edit.edit.clone(), + }, + }; + + let var_n = if self.config.replace_reference { + self.replace_reference(var_n)? + } else { + var_n + }; + + log::debug!("c_to_n({}) = {}", var_c, &var_n); + Ok(var_n) + } else { + Err(anyhow::anyhow!( + "Expected a CdsVariant but received {}", + &var_c + )) + } + } + + /// Convert from transcript variant (n.) to CDS variant (c.). + /// + /// # Args + /// + /// * `var_n` -- `HgvsVariant::TxVariant` to project + pub fn n_to_c(&self, var_n: &HgvsVariant) -> Result { + self.validator.validate(var_n)?; + let var_n = self.replace_reference(var_n.clone())?; + if let HgvsVariant::TxVariant { + accession, + gene_symbol, + loc_edit, + } = &var_n + { + let mapper = + self.build_alignment_mapper(&accession.value, &accession.value, "transcript")?; + let pos_c = mapper.n_to_c(loc_edit.loc.inner())?; + + let var_c = HgvsVariant::CdsVariant { + accession: accession.clone(), + gene_symbol: self.fetch_gene_symbol(accession.deref().as_str(), gene_symbol)?, + loc_edit: CdsLocEdit { + loc: Mu::from(pos_c, loc_edit.loc.is_certain()), + edit: loc_edit.edit.clone(), + }, + }; + + let var_c = if self.config.replace_reference { + self.replace_reference(var_c)? + } else { + var_c + }; + + Ok(var_c) + } else { + Err(anyhow::anyhow!( + "Expected a TxVariant but received {}", + &var_n + )) + } + } + + /// Convert from CDS variant (c.) to protein variant (p.). + /// + /// # Args + /// + /// * `var_c` -- `HgvsVariant::TxVariant` to project + /// * `pro_ac` -- Protein accession + pub fn c_to_p( + &self, + var_c: &HgvsVariant, + prot_ac: Option<&str>, + ) -> Result { + use c_to_p_impl::*; + + if let HgvsVariant::CdsVariant { + accession, + gene_symbol: _, + loc_edit: _, + } = &var_c + { + self.validator.validate(var_c)?; + + let reference_data = RefTranscriptData { + provider: self.provider.clone(), + tx_ac: accession.deref().clone(), + prot_ac: prot_ac.map(|s| s.to_string()), + }; + let builder = AltSeqBuilder::new(&reference_data); + + // NB: the following comment is from the original code. + // TODO: handle case where you get 2+ alt sequences back; currently get lis tof 1 element + // loop structure implemented to handle this, but doesn't really do anything currently. + + let var_ps: Result, anyhow::Error> = builder + .build_altseq()? + .into_iter() + .map(|_alt_data| { + let builder = AltSeqToHgvsp::new(var_c, &reference_data); + builder.build_hgvsp() + }) + .collect(); + let var_p = var_ps?.into_iter().next().unwrap(); + + let var_p = if let HgvsVariant::ProtVariant { + accession, + gene_symbol, + loc_edit, + } = var_p + { + HgvsVariant::ProtVariant { + gene_symbol: self + .fetch_gene_symbol(accession.deref().as_str(), &gene_symbol)?, + accession, + loc_edit, + } + } else { + return Err(anyhow::anyhow!( + "This cannot happen; must have ProtVariant here." + )); + }; + + Ok(var_p) + } else { + Err(anyhow::anyhow!( + "Expected a CdsVariant but received {}", + &var_c + )) + } + } + + fn get_altered_sequence( + &self, + strand: i16, + interval: Range, + var: &HgvsVariant, + ) -> Result { + let mut seq = self.provider.as_ref().get_seq_part( + var.accession(), + Some(interval.start.try_into()?), + Some(interval.end.try_into()?), + )?; + + let r = var.loc_range().ok_or(anyhow::anyhow!( + "Cannot get altered sequence for missing positions" + ))?; + let r = ((r.start - interval.start) as usize)..((r.end - interval.start) as usize); + + let na_edit = var + .na_edit() + .ok_or(anyhow::anyhow!("Variant is missing nucleic acid edit"))?; + + match na_edit { + NaEdit::RefAlt { alternative, .. } | NaEdit::NumAlt { alternative, .. } => { + seq.replace_range(r, alternative) + } + NaEdit::DelRef { .. } | NaEdit::DelNum { .. } => seq.replace_range(r, ""), + NaEdit::Ins { alternative } => seq.replace_range(r, alternative), + NaEdit::Dup { .. } => { + let seg = seq[r.clone()].to_string(); + seq.replace_range(r.end..r.end, &seg); + } + NaEdit::InvRef { .. } | NaEdit::InvNum { .. } => { + let rc = revcomp(&seq[r.clone()]); + seq.replace_range(r, &rc); + } + } + + Ok(if strand == -1 { revcomp(&seq) } else { seq }) + } + + /// Convert an edit from one type to another, based on the strand and type. + fn convert_edit_check_strand( + &self, + strand: i16, + edit: &Mu, + ) -> Result, anyhow::Error> { + let result = if strand == 1 { + edit.inner().clone() + } else { + match edit.inner() { + NaEdit::RefAlt { + reference, + alternative, + } => NaEdit::RefAlt { + reference: revcomp(reference), + alternative: revcomp(alternative), + }, + NaEdit::NumAlt { count, alternative } => NaEdit::NumAlt { + count: *count, + alternative: revcomp(alternative), + }, + NaEdit::DelRef { reference } => NaEdit::DelRef { + reference: revcomp(reference), + }, + NaEdit::Ins { alternative } => NaEdit::Ins { + alternative: revcomp(alternative), + }, + NaEdit::Dup { reference } => NaEdit::Dup { + reference: revcomp(reference), + }, + NaEdit::InvRef { reference } => NaEdit::InvRef { + reference: revcomp(reference), + }, + NaEdit::DelNum { count } => NaEdit::DelNum { count: *count }, + NaEdit::InvNum { count } => NaEdit::InvNum { count: *count }, + } + }; + Ok(Mu::from(result, edit.is_certain())) + } + + /// Fetch reference sequence for variant and return updated `HgvsVariant` if necessary. + pub(crate) fn replace_reference(&self, var: HgvsVariant) -> Result { + log::debug!("replace_reference({})", &var); + match &var { + HgvsVariant::ProtVariant { .. } => Err(anyhow::anyhow!( + "Can only update reference for c, g, m, n, r" + )), + _ => Ok(()), + }?; + + log::debug!("???"); + + if let Some(NaEdit::Ins { .. }) = var.na_edit() { + // Insertions have no reference sequence (zero-width); return as-is. + return Ok(var); + } + + log::debug!("111"); + + if var.spans_intron() { + debug!( + "Can't update reference sequence for intronic variant {}", + var + ); + return Ok(var); + } + + log::debug!("???"); + + // For c. variants, we need coordinates on underlying sequence. + let (r, ac): (Range<_>, _) = match &var { + HgvsVariant::CdsVariant { + accession, + loc_edit, + .. + } => { + let mapper = self.build_alignment_mapper(accession, accession, "transcript")?; + (mapper.c_to_n(loc_edit.loc.inner())?.into(), accession) + } + HgvsVariant::GenomeVariant { + accession, + loc_edit, + .. + } => (loc_edit.loc.inner().clone().try_into()?, accession), + HgvsVariant::MtVariant { + accession, + loc_edit, + .. + } => (loc_edit.loc.inner().clone().try_into()?, accession), + HgvsVariant::TxVariant { + accession, + loc_edit, + .. + } => (loc_edit.loc.inner().clone().into(), accession), + HgvsVariant::RnaVariant { + accession, + loc_edit, + .. + } => (loc_edit.loc.inner().clone().into(), accession), + _ => panic!("Cases excluded above; cannot happen"), + }; + + // NB: The following comment is from the original code (no strict_bounds there either). + // When strict_bounds is false and an error occurs, return variant as-is. + if r.start < 0 { + // This is an out-of-bounds variant. + return Ok(var); + } + log::debug!("get_seq_part({}, {}, {})", ac, r.start, r.end); + let seq = self.provider.as_ref().get_seq_part( + ac, + Some(r.start as usize), + Some(r.end as usize), + )?; + if seq.len() != r.len() { + // Tried to read beyond seq end; this is an out-of-bounds variant. + log::debug!("Bailing out on out-of-bounds variant: {}", &var); + return Ok(var); + } + + log::debug!("foo"); + + let na_edit = var + .na_edit() + .expect("Variant must be of nucleic acid type here"); + if !na_edit.reference_equals(&seq) { + debug!("Replaced reference sequence in {} with {}", &var, &seq); + Ok(var.with_reference(seq)) + } else { + Ok(var) + } + } + + fn fetch_gene_symbol( + &self, + tx_ac: &str, + gene_symbol: &Option, + ) -> Result, anyhow::Error> { + if !self.config.add_gene_symbol { + Ok(gene_symbol.clone()) + } else if let Some(gene_symbol) = gene_symbol { + Ok(Some(gene_symbol.clone())) + } else { + let hgnc = self.provider.as_ref().get_tx_identity_info(tx_ac)?.hgnc; + if hgnc.is_empty() { + Ok(None) + } else { + Ok(Some(GeneSymbol::from(hgnc))) + } + } + } +} + +#[cfg(test)] +mod test { + use std::{rc::Rc, str::FromStr}; + use test_log::test; + + use crate::{ + data::uta::{Config as ProviderConfig, Provider}, + parser::HgvsVariant, + }; + + use super::{Config, Mapper}; + + fn get_config() -> ProviderConfig { + ProviderConfig { + db_url: std::env::var("TEST_UTA_DATABASE_URL") + .expect("Environment variable TEST_UTA_DATABASE_URL undefined!"), + db_schema: std::env::var("TEST_UTA_DATABASE_SCHEMA") + .expect("Environment variable TEST_UTA_DATABASE_SCHEMA undefined!"), + } + } + + fn build_mapper() -> Result { + let provider = Rc::new(Provider::with_config(&get_config())?); + let config = Config::default(); + Ok(Mapper::new(&config, provider)) + } + + #[test] + fn fail_for_invalid_variant_types() -> Result<(), anyhow::Error> { + let mapper = build_mapper()?; + + let hgvs_g = "NC_000007.13:g.36561662C>T"; + let hgvs_c = "NM_001637.3:c.1582G>A"; // gene AOAH + + let var_g = HgvsVariant::from_str(hgvs_g)?; + let var_c = HgvsVariant::from_str(hgvs_c)?; + + assert!(mapper.g_to_c(&var_c, "NM_001637.3", "splign").is_err()); + assert!(mapper.g_to_t(&var_c, "NM_001637.3", "splign").is_err()); + assert!(mapper.n_to_g(&var_c, "NM_001637.3", "splign").is_err()); + assert!(mapper.c_to_g(&var_g, "NM_001637.3", "splign").is_err()); + assert!(mapper.t_to_g(&var_g, "NM_001637.3", "splign").is_err()); + assert!(mapper.c_to_n(&var_g).is_err()); + assert!(mapper.n_to_c(&var_g).is_err()); + assert!(mapper.c_to_p(&var_g, None).is_err()); + + Ok(()) + } + + #[test] + fn fail_c_to_p_on_invalid_nm_accession() -> Result<(), anyhow::Error> { + let mapper = build_mapper()?; + + let hgvs_g = "NC_000007.13:g.36561662C>T"; + let var_g = HgvsVariant::from_str(hgvs_g)?; + + assert!(mapper.c_to_p(&var_g, Some("NM_999999.1")).is_err()); + + Ok(()) + } + + #[ignore] + #[test] + fn fail_on_undefined_cds() -> Result<(), anyhow::Error> { + let mapper = build_mapper()?; + + let hgvs_n = "NR_111984.1:n.44G>A"; // legit + let hgvs_c = "NR_111984.1:c.44G>A"; // bogus: c. with non-coding tx accession + + let var_n = HgvsVariant::from_str(hgvs_n)?; + let var_c = HgvsVariant::from_str(hgvs_c)?; + let _tx_ac = if let HgvsVariant::TxVariant { accession, .. } = &var_c { + Some(accession.value.clone()) + } else { + None + } + .unwrap(); + + // n_to_c: transcript is non-coding + assert!(mapper.n_to_c(&var_n).is_err()); + + // c_to_n: var_c is bogus + assert!(mapper.c_to_n(&var_c).is_err()); + + Ok(()) + } + + #[ignore] + #[test] + fn map_var_of_unsupported_validation() -> Result<(), anyhow::Error> { + let mapper = build_mapper()?; + let hgvs_c = "NM_003777.3:c.13552_*36del57"; // gene DNAH11 + let var_c = HgvsVariant::from_str(hgvs_c)?; + + let var_g = mapper.c_to_g(&var_c, "NC_000007.13", "splign")?; + assert_eq!(format!("{}", &var_g), "NC_000007.13:g.21940852_21940908del"); + + Ok(()) + } + + #[ignore] + #[test] + fn map_to_unknown_p_effect() -> Result<(), anyhow::Error> { + let mapper = build_mapper()?; + let hgvs_c = "NM_020975.4:c.625+9C>T"; // gene RET + let var_c = HgvsVariant::from_str(hgvs_c)?; + let var_p = mapper.c_to_p(&var_c, None)?; + assert_eq!(format!("{}", &var_p), "NP_066124.1:p.?"); + + Ok(()) + } + + #[ignore] + #[test] + fn map_of_c_out_of_cds_bound() -> Result<(), anyhow::Error> { + let mapper = build_mapper()?; + let hgvs_c = "NM_145901.2:c.343T>C"; // gene HMGA1 + let var_c = HgvsVariant::from_str(hgvs_c)?; + assert!(mapper.c_to_p(&var_c, None).is_err()); + + Ok(()) + } + + #[ignore] + #[test] + fn map_of_dup_at_cds_end() -> Result<(), anyhow::Error> { + let mapper = build_mapper()?; + let hgvs_c = "NM_001051.2:c.1257dupG"; // gene SSTR3 + let var_c = HgvsVariant::from_str(hgvs_c)?; + let var_p = mapper.c_to_p(&var_c, None)?; + assert_eq!(format!("{}", &var_p), "NP_001042.1:p.(=)"); + + Ok(()) + } + + #[ignore] + #[test] + fn map_of_c_out_of_reference_bound() -> Result<(), anyhow::Error> { + let mapper = build_mapper()?; + let hgvs_c = "NM_000249.3:c.-73960_*46597del"; // gene MLH1 + let var_c = HgvsVariant::from_str(hgvs_c)?; + assert!(mapper.c_to_p(&var_c, None).is_err()); + + Ok(()) + } } diff --git a/src/normalizer.rs b/src/normalizer.rs new file mode 100644 index 0000000..f4aa97f --- /dev/null +++ b/src/normalizer.rs @@ -0,0 +1,2234 @@ +//! Variant normalization. + +use std::{cmp::Ordering, ops::Range, rc::Rc}; + +use crate::{ + data::interface::Provider, + mapper::variant::Mapper as VariantMapper, + parser::{ + GenomeInterval, GenomeLocEdit, HgvsVariant, MtInterval, MtLocEdit, Mu, NaEdit, RnaInterval, + RnaLocEdit, RnaPos, TxInterval, TxLocEdit, TxPos, + }, + utils::{revcomp, trim_common_prefixes, trim_common_suffixes}, + validator::Validator, +}; + +/// A direction with respect to a sequence. +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +pub enum Direction { + ThreeToFive, + FiveToThree, +} + +/// Configuration for the normalizer. +#[derive(Debug, Clone)] +pub struct Config { + alt_aln_method: String, + cross_boundaries: bool, + shuffle_direction: Direction, + // TODO: inconsistent with passing in the validator... + #[allow(dead_code)] + validate: bool, + window_size: usize, +} + +impl Default for Config { + fn default() -> Self { + Self { + alt_aln_method: "splign".to_string(), + cross_boundaries: false, + shuffle_direction: Direction::FiveToThree, + validate: true, + window_size: 20, + } + } +} + +/// Normalizes variants (5' and 3' shifting). +pub struct Normalizer<'a> { + pub provider: Rc, + pub validator: Rc, + pub config: Config, + pub mapper: &'a VariantMapper, +} + +/// Helper type used in `Normalizer::check_and_guard()`. +struct CheckAndGuardResult { + var: HgvsVariant, + as_is: bool, + cds_to_tx: bool, +} + +impl<'a> Normalizer<'a> { + pub fn new( + mapper: &'a VariantMapper, + provider: Rc, + validator: Rc, + config: Config, + ) -> Self { + Self { + mapper, + provider, + validator, + config, + } + } + + pub fn normalize(&self, var: &HgvsVariant) -> Result { + log::debug!("normalize({})", var); + // Run the pre-normalization checks (a) whether trying to normalize the variant is an + // error, (b) whether the variant should be returned as is, and (c) whether the variant + // was translated from CDS to transcript and needs translation into the other direction. + let CheckAndGuardResult { + var, + as_is, + cds_to_tx, + } = self.check_and_guard(var)?; + log::debug!("~~~~~"); + if as_is { + log::debug!("short-circuit as-is: {}", &var); + return Ok(var); + } + + // Compute boundary for the shuffling depending on the configuration and normalize + // the alleles. + let boundary = self.get_boundary(&var)?; + let (start, end, reference, alternative) = + self.normalize_alleles(&var, boundary.clone())?; + log::debug!( + "~~ start, end, reference, alternative = {} {} {} {}", + start, + end, + &reference, + &alternative + ); + + // Build a new variant from the given alleles and possibly project back to the CDS. + self.build_result(var, start, end, reference, alternative, boundary, cds_to_tx) + } + + fn check_and_guard( + &self, + orig_var: &HgvsVariant, + ) -> Result { + let var = orig_var.clone(); + self.validator.validate(&var)?; + + // Bail out if we cannot reliably normalize. + if var.mu_na_edit().map(|e| !e.is_certain()).unwrap_or(true) + || var.mu_loc_range().map(|e| !e.is_certain()).unwrap_or(false) + { + return Ok(CheckAndGuardResult { + var, + as_is: true, + cds_to_tx: false, + }); + } + + // Guard against unsupported normalizations. + if let HgvsVariant::ProtVariant { .. } = var { + return Err(anyhow::anyhow!( + "Unsupported normalization of protein level variant: {}", + &var + )); + } + + // NB: once we support gene conversions, guard against this here as well. + + let var = self.mapper.replace_reference(var)?; + + // Guard against identity variants. + if let Some(na_edit) = var.na_edit() { + log::debug!("Short-circuit on identity variant: {:?}", &var); + if format!("{na_edit}") == "=" { + return Ok(CheckAndGuardResult { + var, + as_is: true, + cds_to_tx: false, + }); + } + } + + // For CDS variants, first convert to transcript variatn and perform normalization on this + // level. Then, convert normalized variant back. + let (var, cds_to_tx) = if let HgvsVariant::CdsVariant { .. } = var { + (self.mapper.c_to_n(&var)?, true) + } else { + (var, false) + }; + + // Guard against intronic variants. + match &var { + HgvsVariant::TxVariant { .. } | HgvsVariant::RnaVariant { .. } => { + if var.spans_intron() { + Err(anyhow::anyhow!( + "Normalization of intronic variants is not supported" + )) + } else { + Ok(()) + } + } + _ => Ok(()), + }?; + + fn valid_seq_len( + provider: &dyn Provider, + ac: &str, + len: usize, + ) -> Result { + log::debug!("get_seq_part({}, {}, {})", ac, len - 1, len); + let res = provider.get_seq_part(ac, Some(len - 1), Some(len))?; + log::debug!("== {}", res); + Ok(!res.is_empty()) + } + + log::debug!("+++"); + // Bail out when coordinates are out of bounds. + if let Some(var_loc_range) = var.loc_range() { + if var_loc_range.start < 0 + || !valid_seq_len( + self.provider.as_ref(), + var.accession(), + var_loc_range.end as usize, + )? + { + return Err(anyhow::anyhow!("Coordinates are out of bound: {}", var)); + } + } else { + panic!("Cannot happen; guarded against above") + } + + // Assertion against any unsupported variant type. + assert!( + matches!( + &var, + HgvsVariant::GenomeVariant { .. } + | HgvsVariant::MtVariant { .. } + | HgvsVariant::TxVariant { .. } + | HgvsVariant::RnaVariant { .. } + ), + "Variant must have one of the types gmnr" + ); + log::debug!("###"); + + Ok(CheckAndGuardResult { + var, + as_is: false, + cds_to_tx, + }) + } + + /// Obtain position of exon-intron boundary for the current variant. + fn get_boundary(&self, var: &HgvsVariant) -> Result, anyhow::Error> { + log::debug!("get_boundary({})", var); + if !self.config.cross_boundaries + && matches!( + &var, + HgvsVariant::RnaVariant { .. } | HgvsVariant::TxVariant { .. } + ) + { + // Obtain genomic accession. + let map_info = self + .provider + .as_ref() + .get_tx_mapping_options(var.accession())?; + let map_info = map_info + .into_iter() + .filter(|r| r.alt_aln_method == self.config.alt_aln_method) + .collect::>(); + log::debug!("map_info = {:?}", &map_info); + let alt_ac = &map_info[0].alt_ac; + + // Obtain tx info. + let tx_info = self.provider.as_ref().get_tx_info( + var.accession(), + alt_ac, + &self.config.alt_aln_method, + )?; + let cds_start = tx_info.cds_start_i; + let cds_end = tx_info.cds_end_i; + + // Obtain exon info. + let exon_info = self.provider.as_ref().get_tx_exons( + var.accession(), + alt_ac, + &self.config.alt_aln_method, + )?; + log::debug!("exon_info = {:?}", &map_info); + let mut exon_starts = exon_info.iter().map(|r| r.tx_start_i).collect::>(); + exon_starts.sort(); + let mut exon_ends = exon_info.iter().map(|r| r.tx_end_i).collect::>(); + exon_ends.sort(); + exon_starts.push(*exon_ends.last().unwrap()); + exon_ends.push(std::i32::MAX); + log::debug!("exon_starts = {:?}", &exon_starts); + log::debug!("exon_ends = {:?}", &exon_ends); + + // Find the end pos of the exon where the var locates. + let _left = 0; + let _right = std::i32::MAX; + + // NB: the following content is from the original Python code. + // TODO: #242: implement methods to find tx regions + let loc_range = var.loc_range().unwrap(); + log::debug!("var = {}, loc_range = {:?}", &var, var.loc_range()); + let i = exon_starts + .iter() + .enumerate() + .find(|(idx, _x)| { + loc_range.start >= exon_starts[*idx] && loc_range.start < exon_ends[*idx] + }) + .unwrap() + .0; + let j = exon_starts + .iter() + .enumerate() + .find(|(idx, _x)| { + loc_range.end > exon_starts[*idx] && loc_range.end - 1 < exon_ends[*idx] + }) + .unwrap() + .0; + + log::debug!("i = {}", i); + + if i != j { + return Err(anyhow::anyhow!( + "Unsupported normalization of variants spanning the exon-intron boundary: {}", + var + )); + } + + let mut left = exon_starts[i]; + let mut right = exon_ends[i]; + + if let Some(cds_start) = cds_start { + if loc_range.end - 1 < cds_start { + right = std::cmp::min(right, cds_start); + } else if loc_range.start >= cds_start { + left = std::cmp::max(left, cds_start); + } else { + return Err(anyhow::anyhow!( + "Unsupported normalization of variants spanning the UTR-exon boundary: {}", + var + )); + } + } + + if let Some(cds_end) = cds_end { + if loc_range.start >= cds_end { + left = std::cmp::max(left, cds_end); + } else if loc_range.end - 1 < cds_end { + right = std::cmp::min(right, cds_end); + } else { + return Err(anyhow::anyhow!( + "Unsupported normalization of variants spanning the UTR-exon boundary: {}", + var + )); + } + } + + Ok(left..right) + } else { + // For variant types g., m., etc. + Ok(0..std::i32::MAX) + } + } + + /// NB: The returned start/end are 1-based! + fn normalize_alleles( + &self, + var: &HgvsVariant, + boundary: Range, + ) -> Result<(i32, i32, String, String), anyhow::Error> { + let (reference, alternative) = self.get_ref_alt(var, &boundary)?; + let win_size = self.config.window_size; + + log::debug!( + "normalize_alleles({}, {}, {}, {}, {:?}) config={:?}", + &reference, + &alternative, + win_size, + var, + &boundary, + &self.config + ); + + if self.config.shuffle_direction == Direction::FiveToThree { + self.normalize_alleles_5_to_3( + reference, + alternative, + win_size.try_into()?, + var, + boundary, + ) + } else { + self.normalize_alleles_3_to_5( + reference, + alternative, + win_size.try_into()?, + var, + boundary, + ) + } + } + + fn normalize_alleles_5_to_3( + &self, + reference: String, + alternative: String, + win_size: i32, + var: &HgvsVariant, + boundary: Range, + ) -> Result<(i32, i32, String, String), anyhow::Error> { + log::debug!( + "normalize_alleles_5_to_3({}, {}, {}, {}, {:?}) config={:?}", + &reference, + &alternative, + win_size, + var, + &boundary, + &self.config + ); + + let mut reference = reference; + let mut alternative = alternative; + let loc_range = var.loc_range().unwrap(); + log::debug!("loc_rang e= {:?}", &loc_range); + let (mut base, mut start, mut stop) = match var.na_edit().unwrap() { + NaEdit::Ins { .. } => (loc_range.start + 1, 1, 1), + NaEdit::Dup { .. } => (loc_range.end, 1, 1), + _ => (loc_range.start + 1, 0, loc_range.end - loc_range.start), + }; + + loop { + let ref_seq = self.fetch_bounded_seq( + var, + base - 1, + base + stop - 1 + win_size, + win_size, + &boundary, + )?; + log::debug!( + "base = {}, stop = {}, win_size = {}, boundary = {:?}", + base, + stop, + win_size, + &boundary + ); + log::debug!("ref_seq = {}", &ref_seq); + if ref_seq.is_empty() { + break; + } + let (orig_start, _orig_stop) = (start, stop); + (start, stop, reference, alternative) = normalize_alleles( + &ref_seq, + start, + stop, + reference, + alternative, + ref_seq.len(), + win_size, + false, + )?; + log::debug!( + "start, stop, reference, alternative = {}, {}, {}, {}", + start, + stop, + &reference, + &alternative + ); + if stop < ref_seq.len().try_into()? || start == orig_start { + break; + } + // If stop at the end of the window, try to extend teh shuffling to the right. + base += start - orig_start; + stop -= start - orig_start; + start = orig_start; + } + + Ok((base + start, base + stop, reference, alternative)) + } + + fn normalize_alleles_3_to_5( + &self, + reference: String, + alternative: String, + win_size: i32, + var: &HgvsVariant, + boundary: Range, + ) -> Result<(i32, i32, String, String), anyhow::Error> { + log::debug!( + "normalize_alleles_3_to_5({}, {}, {}, {}, {:?}) config={:?}", + &reference, + &alternative, + win_size, + var, + &boundary, + &self.config + ); + let mut reference = reference; + let mut alternative = alternative; + let loc_range = var.loc_range().unwrap(); + log::debug!("loc_range = {:?}", &loc_range); + let mut base = std::cmp::max(loc_range.start + 1 - win_size, 1); + let (mut start, mut stop) = match var.na_edit().unwrap() { + NaEdit::Ins { .. } => (loc_range.end - base, loc_range.end - base), + NaEdit::Dup { .. } => (loc_range.end - base + 1, loc_range.end - base + 1), + _ => (loc_range.start + 1 - base, loc_range.end - base + 1), + }; + + loop { + log::debug!( + "base, boundary, start, stop = {} {:?} {} {}", + base, + &boundary, + start, + stop + ); + if base < boundary.start + 1 { + start -= boundary.start + 1 - base; + stop -= boundary.start + 1 - base; + base = boundary.start + 1; + } + log::debug!( + "..base, boundary, start, stop = {} {:?} {} {}", + base, + &boundary, + start, + stop + ); + let ref_seq = + self.fetch_bounded_seq(var, base - 1, base + stop - 1, start, &boundary)?; + log::debug!("ref_seq = {}", &ref_seq); + if ref_seq.is_empty() { + break; + } + let (_orig_start, orig_stop) = (start, stop); + (start, stop, reference, alternative) = normalize_alleles( + &ref_seq, + start, + stop, + reference, + alternative, + 0, + win_size, + true, + )?; + if start > 0 || stop == orig_stop { + break; + } + // If stop at the end of the window, try to extend the shuffling to the left. + base -= orig_stop - stop; + start += orig_stop - stop; + stop = orig_stop; + } + Ok((base + start, base + stop, reference, alternative)) + } + + /// NB: The parameter start/end are 1-based! + #[allow(clippy::too_many_arguments)] + fn build_result( + &self, + var: HgvsVariant, + start: i32, + end: i32, + reference: String, + alternative: String, + boundary: Range, + cds_to_tx: bool, + ) -> Result { + let ref_len = reference.len() as i32; + let alt_len = alternative.len() as i32; + + log::debug!("alt_len = {}, ref_len = {}", alt_len, ref_len); + + let (ref_start, ref_end, edit) = match alt_len.cmp(&ref_len) { + Ordering::Equal => { + self.build_result_len_eq(start, end, ref_len, &reference, &alternative, alt_len) + } + Ordering::Less => { + self.build_result_len_less(start, end, alt_len, &reference, &alternative) + } + Ordering::Greater => self.build_result_len_gt( + ref_len, + &var, + start, + alt_len, + end, + &boundary, + &alternative, + &reference, + )?, + }; + + log::debug!( + "ref_start = {}, ref_end = {}, edit = {:?}", + ref_start, + ref_end, + &edit + ); + + log::debug!( + "ref_start, ref_end, edit = {} {} {}", + ref_start, + ref_end, + edit + ); + + // Ensure the start is not 0. + let (ref_start, ref_end, edit, _reference, alternative) = if ref_start == 0 { + log::debug!("Prevent start == 0"); + let reference = self.fetch_bounded_seq(&var, 0, 1, 0, &boundary)?; + let alternative = format!("{}{}", alternative, &reference); + + ( + 1, + 1, + NaEdit::RefAlt { + reference: reference.clone(), + alternative: alternative.clone(), + }, + reference, + alternative, + ) + } else { + (ref_start, ref_end, edit, reference, alternative) + }; + + // Ensure the end is not outside of reference sequence. + let tgt_len = self.get_tgt_len(&var)?; + log::debug!("tgt_len = {}", tgt_len); + let (ref_start, ref_end, edit) = if ref_end == tgt_len.saturating_add(1) { + log::debug!("Prevent end outside of reference sequence"); + let reference = self.fetch_bounded_seq(&var, tgt_len - 1, tgt_len, 0, &boundary)?; + let alternative = format!("{}{}", &reference, alternative); + ( + tgt_len, + tgt_len, + NaEdit::RefAlt { + reference, + alternative, + }, + ) + } else { + (ref_start, ref_end, edit) + }; + + let res = self.build_result_construct(var, ref_start, ref_end, edit, cds_to_tx)?; + log::debug!("result = {:?}", &res); + Ok(res) + } + + fn build_result_construct( + &self, + var: HgvsVariant, + ref_start: i32, + ref_end: i32, + edit: NaEdit, + cds_to_tx: bool, + ) -> Result { + log::debug!("build_result_construct({:?})", &var); + Ok(match var { + HgvsVariant::GenomeVariant { + accession, + gene_symbol, + .. + } => HgvsVariant::GenomeVariant { + accession, + gene_symbol, + loc_edit: GenomeLocEdit { + loc: Mu::Certain(GenomeInterval { + start: Some(ref_start), + end: Some(ref_end), + }), + edit: Mu::Certain(edit), + }, + }, + HgvsVariant::MtVariant { + accession, + gene_symbol, + .. + } => HgvsVariant::MtVariant { + accession, + gene_symbol, + loc_edit: MtLocEdit { + loc: Mu::Certain(MtInterval { + start: Some(ref_start), + end: Some(ref_end), + }), + edit: Mu::Certain(edit), + }, + }, + HgvsVariant::TxVariant { + accession, + gene_symbol, + .. + } => { + let var_t = HgvsVariant::TxVariant { + accession, + gene_symbol, + loc_edit: TxLocEdit { + loc: Mu::Certain(TxInterval { + start: TxPos { + base: ref_start, + offset: None, + }, + end: TxPos { + base: ref_end, + offset: None, + }, + }), + edit: Mu::Certain(edit), + }, + }; + + log::debug!("var_t = {}", &var_t); + + if cds_to_tx { + self.mapper.n_to_c(&var_t)? + } else { + var_t + } + } + HgvsVariant::RnaVariant { + accession, + gene_symbol, + .. + } => HgvsVariant::RnaVariant { + accession, + gene_symbol, + loc_edit: RnaLocEdit { + loc: Mu::Certain(RnaInterval { + start: RnaPos { + base: ref_start, + offset: None, + }, + end: RnaPos { + base: ref_end, + offset: None, + }, + }), + edit: Mu::Certain(edit), + }, + }, + _ => panic!("Cannot happen; variant types guarded above"), + }) + } + + #[allow(clippy::too_many_arguments)] + fn build_result_len_gt( + &self, + ref_len: i32, + var: &HgvsVariant, + start: i32, + alt_len: i32, + end: i32, + boundary: &Range, + alternative: &String, + reference: &String, + ) -> Result<(i32, i32, NaEdit), anyhow::Error> { + Ok(if ref_len == 0 { + log::debug!("build_result_len_gt(ref_len={}, var={}, start={}, alt_len={}, end={}, boundary={:?}, {}, {})", ref_len, var, start, alt_len, end, boundary, alternative, reference); + let adj_seq = if self.config.shuffle_direction == Direction::FiveToThree { + self.fetch_bounded_seq(var, start - alt_len - 1, end - 1, 0, boundary)? + } else { + self.fetch_bounded_seq(var, start - 1, start + alt_len - 1, 0, boundary)? + }; + log::debug!("alternative, adj_seq = {}, {}", &alternative, &adj_seq); + + if *alternative != adj_seq { + // ins + ( + start - 1, + end, + NaEdit::Ins { + alternative: alternative.clone(), + }, + ) + } else { + // dup + if self.config.shuffle_direction == Direction::FiveToThree { + ( + start - alt_len, + end - 1, + NaEdit::Dup { + reference: alternative.clone(), + }, + ) + } else { + ( + start, + start + alt_len - 1, + NaEdit::Dup { + reference: alternative.clone(), + }, + ) + } + } + } else { + // delins + ( + start, + end - 1, + NaEdit::RefAlt { + reference: reference.clone(), + alternative: alternative.clone(), + }, + ) + }) + } + + /// Build the result for the del/delins case. + fn build_result_len_less( + &self, + start: i32, + end: i32, + alt_len: i32, + reference: &str, + alternative: &str, + ) -> (i32, i32, NaEdit) { + ( + start, + end - 1, + if alt_len == 0 { + NaEdit::DelRef { + reference: reference.to_owned(), + } + } else { + NaEdit::RefAlt { + reference: reference.to_owned(), + alternative: alternative.to_owned(), + } + }, + ) + } + + /// Build the result for the case of ref.len() == alt.len() + fn build_result_len_eq( + &self, + start: i32, + end: i32, + ref_len: i32, + reference: &str, + alternative: &str, + alt_len: i32, + ) -> (i32, i32, NaEdit) { + log::debug!("Ordering::Equal"); + let ref_start = start; + let ref_end = end - 1; + + if ref_len > 1 && *reference == revcomp(alternative) { + log::debug!("inversion"); + // inversion + ( + ref_start, + ref_end, + NaEdit::InvRef { + reference: reference.to_owned(), + }, + ) + } else if ref_len == 0 && alt_len == 0 { + log::debug!("identity"); + // identity + ( + ref_end, + ref_end, + NaEdit::RefAlt { + reference: reference.to_owned(), + alternative: alternative.to_owned(), + }, + ) + } else { + log::debug!("subs or delins"); + // substitution or delins + ( + ref_start, + ref_end, + if alt_len == 0 { + NaEdit::DelRef { + reference: reference.to_owned(), + } + } else { + NaEdit::RefAlt { + reference: reference.to_owned(), + alternative: alternative.to_owned(), + } + }, + ) + } + } + + /// Fetch reference sequence from HGVS data provider. + /// + /// The start position is 0 and the interval is half-open. + fn fetch_bounded_seq( + &self, + var: &HgvsVariant, + start: i32, + end: i32, + window_size: i32, + boundary: &Range, + ) -> Result { + log::debug!( + "fetch_bounded_seq(var={}, start={}, end={}, window_size={}, boundary={:?})", + var, + start, + end, + window_size, + &boundary + ); + let var_len = end - start - window_size; + + let start = std::cmp::max(start, boundary.start); + let end = std::cmp::min(end, boundary.end); + if start >= end { + return Ok("".to_string()); + } + + let seq = self.provider.get_seq_part( + var.accession(), + Some(start.try_into()?), + Some(end.try_into()?), + )?; + log::debug!( + "get_seq_part({}, {}, {}) = {}", + var.accession(), + start, + end, + &seq + ); + let seq_len: i32 = seq.len().try_into()?; + + if seq_len < end - start && seq_len < var_len { + Err(anyhow::anyhow!( + "Variant span is outside of sequence bounds: {}", + &var + )) + } else { + Ok(seq) + } + } + + fn get_tgt_len(&self, var: &HgvsVariant) -> Result { + Ok( + if matches!( + var, + HgvsVariant::GenomeVariant { .. } | HgvsVariant::MtVariant { .. } + ) { + std::i32::MAX + } else { + let id_info = self.provider.get_tx_identity_info(var.accession())?; + log::debug!("id_info = {:?}", &id_info); + id_info.lengths.into_iter().sum() + }, + ) + } + + fn get_ref_alt( + &self, + var: &HgvsVariant, + boundary: &Range, + ) -> Result<(String, String), anyhow::Error> { + // Get reference allele. + let reference = match var.na_edit().unwrap() { + NaEdit::Ins { .. } | NaEdit::Dup { .. } => "".to_string(), + NaEdit::RefAlt { .. } | NaEdit::InvRef { .. } | NaEdit::DelRef { .. } => { + let loc_range = var.loc_range().unwrap(); + self.fetch_bounded_seq(var, loc_range.start, loc_range.end, 0, boundary)? + } + _ => panic!("Cannot work with NumAlt,DelNum/InvNum"), + }; + + // Get alternative allele. + let alternative = match var.na_edit().unwrap() { + NaEdit::DelRef { .. } => "".to_string(), + NaEdit::RefAlt { alternative, .. } | NaEdit::Ins { alternative } => { + alternative.to_string() + } + NaEdit::Dup { .. } => { + let loc_range = var.loc_range().unwrap(); + self.fetch_bounded_seq(var, loc_range.start, loc_range.end, 0, boundary)? + } + NaEdit::InvRef { .. } => revcomp(&reference), + _ => panic!("Cannot work with NumAlt,DelNum/InvNum"), + }; + + log::debug!( + "get_ref_alt({}, {:?}, {}, {})", + &var, + boundary, + &reference, + &alternative + ); + + Ok((reference, alternative)) + } +} + +#[allow(clippy::too_many_arguments)] +fn normalize_alleles( + ref_seq: &str, + start: i32, + stop: i32, + reference: String, + alternative: String, + bound: usize, + ref_step: i32, + left: bool, +) -> Result<(i32, i32, String, String), anyhow::Error> { + log::debug!( + "normalize_alleles({}, {}, {}, {}, {}, {}, {}, {})", + ref_seq, + start, + stop, + &reference, + &alternative, + bound, + ref_step, + left + ); + if left { + normalize_alleles_left( + ref_seq, + start.try_into()?, + stop.try_into()?, + reference, + alternative, + bound, + ref_step.try_into()?, + ) + } else { + normalize_alleles_right( + ref_seq, + start.try_into()?, + stop.try_into()?, + reference, + alternative, + bound, + ref_step.try_into()?, + ) + } +} + +fn normalize_alleles_left( + ref_seq: &str, + start: usize, + stop: usize, + reference: String, + alternative: String, + bound: usize, + ref_step: usize, +) -> Result<(i32, i32, String, String), anyhow::Error> { + // Step 1: Trim common suffix./ + let (trimmed, reference, alternative) = trim_common_suffixes(&reference, &alternative); + let mut stop = stop - trimmed; + log::debug!( + "after suffix trimming {} {:?} {:?} {}", + trimmed, + &reference, + &alternative, + stop + ); + + // Step 2: Trim common prefix. + let (trimmed, mut reference, mut alternative) = trim_common_prefixes(&reference, &alternative); + let mut start = start + trimmed; + log::debug!( + "after suffix trimming {} {:?} {:?} {}", + trimmed, + &reference, + &alternative, + start + ); + + // Step 3: While a null allele exists, right shuffle by appending alleles with reference + // and trimming common prefixes. + + let shuffle = true; + log::debug!( + "shuffle, reference, alternative, start, bound = {} {} {} {} {}", + shuffle, + &reference, + &alternative, + stop, + bound + ); + while shuffle && (reference.is_empty() || alternative.is_empty()) && start > bound { + let step = std::cmp::min(ref_step, start - bound); + + let r = ref_seq[(start - step)..(start - bound)].to_uppercase(); + log::debug!("r = {}", &r); + let new_reference = format!("{r}{reference}"); + let new_alternative = format!("{r}{alternative}"); + + let (trimmed, new_reference, new_alternative) = + trim_common_suffixes(&new_reference, &new_alternative); + log::debug!( + "trimmed = {}, new_reference = {}, new_alternative = {}", + trimmed, + &new_reference, + &new_alternative + ); + + if trimmed == 0 { + break; + } + + start -= trimmed; + stop -= trimmed; + + if trimmed == step { + reference = new_reference; + alternative = new_alternative; + } else { + let left = step - trimmed; + let r = left..; + reference = new_reference[r.clone()].to_string(); + alternative = new_alternative[r.clone()].to_string(); + log::debug!( + "break -- left, r, reference, alternative = {} {:?} {} {}", + left, + &r, + &reference, + &alternative + ); + break; + } + + log::debug!( + "~~shuffle, reference, alternative, start, bound = {} {} {} {} {}", + shuffle, + &reference, + &alternative, + stop, + bound + ); + } + + Ok((start.try_into()?, stop.try_into()?, reference, alternative)) +} + +fn normalize_alleles_right( + ref_seq: &str, + start: usize, + stop: usize, + reference: String, + alternative: String, + bound: usize, + ref_step: usize, +) -> Result<(i32, i32, String, String), anyhow::Error> { + log::debug!("normalize_alleles_right"); + // Step 1: Trim common prefix. + let (trimmed, reference, alternative) = trim_common_prefixes(&reference, &alternative); + let mut start = start + trimmed; + log::debug!( + "after prefix trimming {} {} {} {}", + trimmed, + &reference, + &alternative, + start + ); + + // Step 2: Trim common suffix. + let (trimmed, mut reference, mut alternative) = trim_common_suffixes(&reference, &alternative); + let mut stop = stop - trimmed; + log::debug!( + "after suffix trimming {} {} {} {}", + trimmed, + &reference, + &alternative, + stop + ); + + // Step 3: While a null allele exists, right shuffle by appending alleles with reference + // and trimming common prefixes. + + let shuffle = true; + log::debug!( + "shuffle, reference, alternative, start, bound = {} {} {} {} {}", + shuffle, + &reference, + &alternative, + stop, + bound + ); + while shuffle && (reference.is_empty() || alternative.is_empty()) && stop < bound { + let step = std::cmp::min(ref_step, bound - stop); + + let r = ref_seq[stop..(stop + step)].to_uppercase(); + log::debug!("r = {}", &r); + let new_reference = format!("{reference}{r}"); + let new_alternative = format!("{alternative}{r}"); + + let (trimmed, new_reference, new_alternative) = + trim_common_prefixes(&new_reference, &new_alternative); + + log::debug!( + "trimmed = {}, new_reference = {}, new_alternative = {}", + trimmed, + &new_reference, + &new_alternative + ); + + if trimmed == 0 { + break; + } + + start += trimmed; + stop += trimmed; + + if trimmed == step { + reference = new_reference; + alternative = new_alternative; + } else { + let left = step - trimmed; + let r = ..(new_reference.len() - left); + reference = new_reference[r].to_string(); + let r = ..(new_alternative.len() - left); + alternative = new_alternative[r].to_string(); + log::debug!( + "break -- left, r, reference, alternative = {} {:?} {} {}", + left, + &r, + &reference, + &alternative + ); + break; + } + log::debug!( + "shuffle, reference, alternative, start, bound = {} {} {} {} {}", + shuffle, + &reference, + &alternative, + stop, + bound + ); + } + + Ok((start.try_into()?, stop.try_into()?, reference, alternative)) +} + +#[cfg(test)] +mod test { + use test_log::test; + + use std::{rc::Rc, str::FromStr}; + + use pretty_assertions::assert_eq; + + use super::{Config, Direction, Normalizer}; + use crate::{ + data::uta_sr::test_helpers::build_provider, + mapper::variant::Mapper, + parser::{HgvsVariant, NoRef}, + validator::IntrinsicValidator, + }; + + fn normalizers( + mapper: &Mapper, + ) -> Result<(Normalizer, Normalizer, Normalizer, Normalizer), anyhow::Error> { + let provider = mapper.provider(); + let validator = Rc::new(IntrinsicValidator::new(true)); + + Ok(( + Normalizer::new( + mapper, + provider.clone(), + validator.clone(), + Config { + shuffle_direction: Direction::FiveToThree, + cross_boundaries: true, + ..Default::default() + }, + ), + Normalizer::new( + mapper, + provider.clone(), + validator.clone(), + Config { + shuffle_direction: Direction::ThreeToFive, + cross_boundaries: true, + ..Default::default() + }, + ), + Normalizer::new( + mapper, + provider.clone(), + validator.clone(), + Config { + shuffle_direction: Direction::FiveToThree, + cross_boundaries: false, + ..Default::default() + }, + ), + Normalizer::new( + mapper, + provider.clone(), + validator, + Config { + shuffle_direction: Direction::ThreeToFive, + cross_boundaries: false, + ..Default::default() + }, + ), + )) + } + + #[test] + fn normalize_cds_3_prime_shuffling() -> Result<(), anyhow::Error> { + let mapper = Mapper::new(&Default::default(), build_provider()?); + let (norm, _norm5, _normc, _norm5c) = normalizers(&mapper)?; + + // The following tests correspond to the Python ones "3' shuffling" and "5' shuffling". + + let cases3 = vec![ + // gene COL1A + ("NM_000088.3:c.589_600inv", "NM_000088.3:c.590_599inv"), + // gene DEFB133 + ("NM_001166478.1:c.31del", "NM_001166478.1:c.35del"), + ("NM_001166478.1:c.35_36insT", "NM_001166478.1:c.35dup"), + ("NM_001166478.1:c.36_37insTC", "NM_001166478.1:c.36_37dup"), + ("NM_001166478.1:c.35_36dup", "NM_001166478.1:c.36_37dup"), + ( + "NM_001166478.1:c.2_7delinsTTTAGA", + "NM_001166478.1:c.3_4delinsTT", + ), + ("NM_001166478.1:c.30_31insT", "NM_001166478.1:c.35dup"), + ("NM_001166478.1:c.59delG", "NM_001166478.1:c.61del"), + ( + "NM_001166478.1:c.36_37insTCTCTC", + "NM_001166478.1:c.37_38insCTCTCT", + ), + // gene ATM + ("NM_000051.3:c.14_15insT", "NM_000051.3:c.15dup"), + ]; + + for (input, exp_3) in cases3 { + log::info!("3' test case: ({}, {})", &input, &exp_3); + let raw = HgvsVariant::from_str(input)?; + + // 5' -> 3' shuffling + let res_3 = norm.normalize(&raw)?; + assert_eq!( + format!("{}", &NoRef(&res_3)), + exp_3, + "{:?} ~ {:?} ~ {:?}", + &res_3, + &raw, + &exp_3 + ); + } + + Ok(()) + } + + #[test] + fn normalize_cds_5_prime_shuffling() -> Result<(), anyhow::Error> { + let mapper = Mapper::new(&Default::default(), build_provider()?); + let (_norm, norm5, _normc, _norm5c) = normalizers(&mapper)?; + + let cases5 = vec![ + // gene COL1A + ("NM_000088.3:c.589_600inv", "NM_000088.3:c.590_599inv"), + // gene DEFB133 + ("NM_001166478.1:c.34del", "NM_001166478.1:c.31del"), + ("NM_001166478.1:c.35_36insT", "NM_001166478.1:c.31dup"), + ("NM_001166478.1:c.36_37insTC", "NM_001166478.1:c.35_36dup"), + ("NM_001166478.1:c.35_36dup", "NM_001166478.1:c.35_36dup"), + ( + "NM_001166478.1:c.2_7delinsTTTAGA", + "NM_001166478.1:c.3_4delinsTT", + ), + ("NM_001166478.1:c.30_31insT", "NM_001166478.1:c.31dup"), + ("NM_001166478.1:c.61delG", "NM_001166478.1:c.59del"), + ( + "NM_001166478.1:c.36_37insTCTCTC", + "NM_001166478.1:c.34_35insTCTCTC", + ), + // gene ATM + ("NM_000051.3:c.14_15insT", "NM_000051.3:c.14dup"), + ]; + + for (input, exp_5) in cases5 { + log::info!("5' test case: ({}, {})", &input, &exp_5); + let raw = HgvsVariant::from_str(input)?; + + // 3' -> 5' shuffling + let res_5 = norm5.normalize(&raw)?; + assert_eq!( + format!("{}", &NoRef(&res_5)), + exp_5, + "{:?} ~ {:?} ~ {:?}", + &res_5, + &raw, + &exp_5 + ); + } + + Ok(()) + } + + #[test] + fn normalize_cds_around_exon_intron_boundary() -> Result<(), anyhow::Error> { + let mapper = Mapper::new(&Default::default(), build_provider()?); + let (_norm, _norm5, normc, norm5c) = normalizers(&mapper)?; + + // The following tests correspond to the Python ones "Around exon-intron boundary". + { + let raw = HgvsVariant::from_str("NM_001166478.1:c.59delG")?; + let exp = "NM_001166478.1:c.60del"; + let res = normc.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + { + let raw = HgvsVariant::from_str("NM_001166478.1:c.61delG")?; + let exp = "NM_001166478.1:c.61del"; + let res = norm5c.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + { + // gene MECP2 + let raw = HgvsVariant::from_str("NM_001110792.1:c.1030_1035del")?; + let exp = "NM_001110792.1:c.1029_1034del"; + let res = norm5c.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + { + let raw = HgvsVariant::from_str("NM_001166478.1:c.59_61del")?; + // with self.assertRaises(HGVSUnsupportedOperationError): + assert!(normc.normalize(&raw).is_err()); + } + Ok(()) + } + + #[test] + fn normalize_cds_utr_variant() -> Result<(), anyhow::Error> { + let mapper = Mapper::new(&Default::default(), build_provider()?); + let (norm, norm5, normc, _norm5c) = normalizers(&mapper)?; + + // The following tests correspond to the Python ones "UTR variants". + { + let raw = HgvsVariant::from_str("NM_000051.3:c.-5_-4insA")?; + let exp = "NM_000051.3:c.-3dup"; + let res = norm.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + { + let raw = HgvsVariant::from_str("NM_000051.3:c.-4_-3insAC")?; + let exp = "NM_000051.3:c.-3_-2dup"; + let res = norm.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + { + let raw = HgvsVariant::from_str("NM_000051.3:c.-2_-1insCA")?; + let exp = "NM_000051.3:c.-1_1dup"; + let res = norm.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + + { + let raw = HgvsVariant::from_str("NM_000051.3:c.-2_-1insCA")?; + let exp = "NM_000051.3:c.-1_1insAC"; + let res = normc.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + + { + let raw = HgvsVariant::from_str("NM_000051.3:c.-4_-3insA")?; + let exp = "NM_000051.3:c.-4dup"; + let res = norm5.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + { + let raw = HgvsVariant::from_str("NM_000051.3:c.1_2insCA")?; + let exp = "NM_000051.3:c.-1_1dup"; + let res = norm5.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + + { + let raw = HgvsVariant::from_str("NM_000051.3:c.*2_*3insT")?; + let exp = "NM_000051.3:c.*4dup"; + let res = norm.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + { + let raw = HgvsVariant::from_str("NM_000051.3:c.9170_9171insAT")?; + let exp = "NM_000051.3:c.9171_*1dup"; + let res = norm.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + + { + let raw = HgvsVariant::from_str("NM_000051.3:c.*4_*5insT")?; + let exp = "NM_000051.3:c.*3dup"; + let res = norm5.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + { + let raw = HgvsVariant::from_str("NM_000051.3:c.9171_*1insA")?; + let exp = "NM_000051.3:c.9171dup"; + let res = norm5.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + + { + // gene BRCA2 + let raw = HgvsVariant::from_str("NM_000059.3:c.7790delAAG")?; + // with self.assertRaises(HGVSInvalidVariantError): + assert!(norm.normalize(&raw).is_err()); + } + + Ok(()) + } + + #[test] + fn normalize_genome_3_prime_shuffling() -> Result<(), anyhow::Error> { + let mapper = Mapper::new(&Default::default(), build_provider()?); + let (norm, _norm5, _normc, _norm5c) = normalizers(&mapper)?; + + // 3' shuffling + { + // GRCh38:chr6 + let raw = HgvsVariant::from_str("NC_000006.11:g.49917122_49917123insA")?; + let exp = "NC_000006.11:g.49917127dup"; + let res = norm.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + { + let raw = HgvsVariant::from_str("NC_000006.11:g.49917121_49917122insGA")?; + let exp = "NC_000006.11:g.49917122_49917123dup"; + let res = norm.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + { + let raw = HgvsVariant::from_str("NC_000006.11:g.49917122_49917123dup")?; + let exp = "NC_000006.11:g.49917122_49917123dup"; + let res = norm.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + { + let raw = HgvsVariant::from_str("NC_000006.11:g.49917122_49917123dupGA")?; + let exp = "NC_000006.11:g.49917122_49917123dup"; + let res = norm.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + { + let raw = HgvsVariant::from_str("NC_000006.11:g.49917098delC")?; + let exp = "NC_000006.11:g.49917099del"; + let res = norm.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + { + let raw = HgvsVariant::from_str("NC_000006.11:g.49917151_49917156delinsTCTAAA")?; + let exp = "NC_000006.11:g.49917154_49917155delinsAA"; + let res = norm.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + { + // GRCh37.p13:chr1 + let raw = HgvsVariant::from_str("NC_000001.10:g.1647893delinsCTTTCTT")?; + let exp = "NC_000001.10:g.1647895_1647900dup"; + let res = norm.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + { + // GRCh38.p12:chr3 + let raw = HgvsVariant::from_str( + "NC_000003.12:g.46709584_46709610del27insAAGAAGAAGAAGAAGAAGAAGAAGAAG", + )?; + let exp = "NC_000003.12:g.46709584_46709610="; + let res = norm.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + + Ok(()) + } + + #[test] + fn normalize_genome_5_prime_shuffling() -> Result<(), anyhow::Error> { + let mapper = Mapper::new(&Default::default(), build_provider()?); + let (norm, norm5, _normc, _norm5c) = normalizers(&mapper)?; + + // 5' shuffling + { + let raw = HgvsVariant::from_str("NC_000006.11:g.49917122_49917123insA")?; + let exp = "NC_000006.11:g.49917123dup"; + let res = norm5.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + { + let raw = HgvsVariant::from_str("NC_000006.11:g.49917121_49917122insGA")?; + let exp = "NC_000006.11:g.49917121_49917122dup"; + let res = norm5.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + { + let raw = HgvsVariant::from_str("NC_000006.11:g.49917122_49917123dup")?; + let exp = "NC_000006.11:g.49917121_49917122dup"; + let res = norm5.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + { + let raw = HgvsVariant::from_str("NC_000006.11:g.49917122_49917123dupGA")?; + let exp = "NC_000006.11:g.49917121_49917122dup"; + let res = norm5.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + { + let raw = HgvsVariant::from_str("NC_000006.11:g.49917099delC")?; + let exp = "NC_000006.11:g.49917098del"; + let res = norm5.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + { + let raw = HgvsVariant::from_str("NC_000006.11:g.49917151_49917156delinsTCTAAA")?; + let exp = "NC_000006.11:g.49917154_49917155delinsAA"; + let res = norm5.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + { + let raw = HgvsVariant::from_str( + "NC_000003.12:g.46709584_46709610del27insAAGAAGAAGAAGAAGAAGAAGAAGAAG", + )?; + let exp = "NC_000003.12:g.46709584_46709610="; + let res = norm5.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + + { + let raw = HgvsVariant::from_str("NC_000009.11:g.36233991_36233992delCAinsTG")?; + let exp = "NC_000009.11:g.36233991_36233992inv"; + let res = norm.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + + { + let raw = HgvsVariant::from_str("NG_032871.1:g.32476_53457delinsAATTAAGGTATA")?; + // with self.assertRaises(HGVSInvalidVariantError): + assert!(norm.normalize(&raw).is_err()); + } + { + let raw = HgvsVariant::from_str("NG_032871.1:g.32476_53457delinsAATTAAGGTATA")?; + // with self.assertRaises(HGVSInvalidVariantError): + assert!(norm.normalize(&raw).is_err()); + } + + Ok(()) + } + + #[test] + fn normalize_near_tx_start_end() -> Result<(), anyhow::Error> { + let mapper = Mapper::new(&Default::default(), build_provider()?); + let (norm, norm5, normc, norm5c) = normalizers(&mapper)?; + + { + // gene OR9A4 + let raw = HgvsVariant::from_str("NM_001001656.1:c.935T>C")?; + let exp = "NM_001001656.1:c.935T>C"; + let res = norm.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + { + let raw = HgvsVariant::from_str("NM_001001656.1:c.945G>C")?; + let exp = "NM_001001656.1:c.945G>C"; + let res = norm.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + { + let raw = HgvsVariant::from_str("NM_001001656.1:c.945dup")?; + let exp = "NM_001001656.1:c.945dup"; + let res = norm.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + { + let raw = HgvsVariant::from_str("NM_001001656.1:c.935_945del")?; + let exp = "NM_001001656.1:c.935_945del"; + let res = norm.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + + { + let raw = HgvsVariant::from_str("NM_001001656.1:c.946G>C")?; + // with self.assertRaises(HGVSError): + assert!(norm.normalize(&raw).is_err()); + } + { + let raw = HgvsVariant::from_str("NM_001001656.1:c.946dup")?; + // with self.assertRaises(HGVSError): + assert!(norm.normalize(&raw).is_err()); + } + { + let raw = HgvsVariant::from_str("NM_001001656.1:c.935_946del")?; + // with self.assertRaises(HGVSError): + assert!(norm.normalize(&raw).is_err()); + } + + { + let raw = HgvsVariant::from_str("NM_001001656.1:c.935T>C")?; + let exp = "NM_001001656.1:c.935T>C"; + let res = normc.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + { + let raw = HgvsVariant::from_str("NM_001001656.1:c.945G>C")?; + let exp = "NM_001001656.1:c.945G>C"; + let res = normc.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + { + let raw = HgvsVariant::from_str("NM_001001656.1:c.945dup")?; + let exp = "NM_001001656.1:c.945dup"; + let res = normc.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + { + let raw = HgvsVariant::from_str("NM_001001656.1:c.935_945del")?; + let exp = "NM_001001656.1:c.935_945del"; + let res = normc.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + { + let raw = HgvsVariant::from_str("NM_001001656.1:c.946G>C")?; + // with self.assertRaises(HGVSError): + assert!(normc.normalize(&raw).is_err()); + } + { + let raw = HgvsVariant::from_str("NM_001001656.1:c.946dup")?; + // with self.assertRaises(HGVSError): + assert!(normc.normalize(&raw).is_err()); + } + { + let raw = HgvsVariant::from_str("NM_001001656.1:c.935_946del")?; + // with self.assertRaises(HGVSError): + assert!(normc.normalize(&raw).is_err()); + } + + { + let raw = HgvsVariant::from_str("NM_001001656.1:c.1A>T")?; + let exp = "NM_001001656.1:c.1A>T"; + let res = norm5.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + { + let raw = HgvsVariant::from_str("NM_001001656.1:c.1del")?; + let exp = "NM_001001656.1:c.1del"; + let res = norm5.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + { + let raw = HgvsVariant::from_str("NM_001001656.1:c.1dup")?; + let exp = "NM_001001656.1:c.1dup"; + let res = norm5.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + + { + let raw = HgvsVariant::from_str("NM_001001656.1:c.1A>T")?; + let exp = "NM_001001656.1:c.1A>T"; + let res = norm5c.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + { + let raw = HgvsVariant::from_str("NM_001001656.1:c.1del")?; + let exp = "NM_001001656.1:c.1del"; + let res = norm5c.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + { + let raw = HgvsVariant::from_str("NM_001001656.1:c.1dup")?; + let exp = "NM_001001656.1:c.1dup"; + let res = norm5c.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + + { + let raw = HgvsVariant::from_str("NM_212556.2:c.1_2insCA")?; + let exp = "NM_212556.2:c.1_2insCA"; + let res = norm.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + { + let raw = HgvsVariant::from_str("NM_212556.2:c.2_3insCAT")?; + let exp = "NM_212556.2:c.2_3insCAT"; + let res = norm.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + { + let raw = HgvsVariant::from_str("NM_212556.2:c.1delinsCA")?; + let exp = "NM_212556.2:c.1delinsCA"; + let res = norm.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + + { + let raw = HgvsVariant::from_str("NM_212556.2:c.1_2insCA")?; + let exp = "NM_212556.2:c.1delinsACA"; + let res = norm5.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + { + let raw = HgvsVariant::from_str("NM_212556.2:c.2_3insCAT")?; + let exp = "NM_212556.2:c.1delinsATCA"; + let res = norm5.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + { + let raw = HgvsVariant::from_str("NM_212556.2:c.1delinsCA")?; + let exp = "NM_212556.2:c.1delinsCA"; + let res = norm5.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + { + let raw = HgvsVariant::from_str("NM_212556.2:c.1delinsAA")?; + let exp = "NM_212556.2:c.1dup"; + let res = norm5.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + + { + let raw = HgvsVariant::from_str("NM_212556.2:c.1400_1401insAC")?; + let exp = "NM_212556.2:c.1401delinsACA"; + let res = norm.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + { + let raw = HgvsVariant::from_str("NM_212556.2:c.1400_1401insAC")?; + let exp = "NM_212556.2:c.1401delinsACA"; + let res = normc.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + { + let raw = HgvsVariant::from_str("NM_212556.2:c.1401delinsAA")?; + let exp = "NM_212556.2:c.1401dup"; + let res = normc.normalize(&raw)?; + + assert_eq!( + format!("{}", &NoRef(&res)), + exp, + "{:?} ~ {:?} ~ {:?}", + &res, + &raw, + &exp + ); + } + + Ok(()) + } +} diff --git a/src/parser/display.rs b/src/parser/display.rs index 492f35a..3e1b585 100644 --- a/src/parser/display.rs +++ b/src/parser/display.rs @@ -1,9 +1,29 @@ //! Implementation of Display trait. +//! +//! Also, we implement a `NoRef` newtype that can be used for suppressing +//! output of the reference alleles. This is mainly useful for running the +//! tests with the same data as the Python hgvs module. use std::fmt::Display; use crate::parser::ds::*; +/// Newtype that allows to suppress printing of reference bases. +pub struct NoRef<'a, T>(pub &'a T) +where + T: Display; + +impl NoRef<'_, T> +where + T: Display, +{ + pub fn inner(&self) -> &T { + match self { + NoRef(value) => value, + } + } +} + impl Display for Mu where T: Display, @@ -16,6 +36,19 @@ where } } +impl<'a, T> Display for NoRef<'a, Mu> +where + T: Display, + NoRef<'a, T>: std::fmt::Display, +{ + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + NoRef(Mu::Certain(value)) => write!(f, "{}", NoRef(value)), + NoRef(Mu::Uncertain(value)) => write!(f, "({})", NoRef(value)), + } + } +} + impl Display for GeneSymbol { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}", self.value) @@ -51,6 +84,31 @@ impl Display for NaEdit { } } +impl<'a> Display for NoRef<'a, NaEdit> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + NoRef(NaEdit::RefAlt { + reference, + alternative, + }) => match (reference.len(), alternative.len()) { + (0, 0) => write!(f, "="), + (1, 1) => write!(f, "{reference}>{alternative}"), + (_, 0) => write!(f, "delins"), + (_, _) => write!(f, "delins{alternative}"), + }, + NoRef(NaEdit::NumAlt { count, alternative }) => match (count, alternative.len()) { + (0, 0) => write!(f, "="), + (_, 0) => write!(f, "delins"), + (_, _) => write!(f, "delins{alternative}"), + }, + NoRef(NaEdit::DelRef { .. }) | NoRef(NaEdit::DelNum { .. }) => write!(f, "del"), + NoRef(NaEdit::Ins { alternative }) => write!(f, "ins{alternative}"), + NoRef(NaEdit::Dup { .. }) => write!(f, "dup"), + NoRef(NaEdit::InvRef { .. }) | NoRef(NaEdit::InvNum { .. }) => write!(f, "inv"), + } + } +} + impl Display for UncertainLengthChange { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { @@ -164,12 +222,24 @@ impl Display for ProtLocEdit { } } +impl<'a> Display for NoRef<'a, ProtLocEdit> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.inner().fmt(f) + } +} + impl Display for CdsLocEdit { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}{}", self.loc, self.edit) } } +impl<'a> Display for NoRef<'a, CdsLocEdit> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}{}", self.inner().loc, NoRef(&self.inner().edit)) + } +} + impl Display for CdsInterval { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}", self.start)?; @@ -205,6 +275,12 @@ impl Display for TxLocEdit { } } +impl<'a> Display for NoRef<'a, TxLocEdit> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}{}", self.inner().loc, NoRef(&self.inner().edit)) + } +} + impl Display for TxInterval { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}", self.start)?; @@ -236,6 +312,12 @@ impl Display for RnaLocEdit { } } +impl<'a> Display for NoRef<'a, RnaLocEdit> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}{}", self.inner().loc, NoRef(&self.inner().edit)) + } +} + impl Display for RnaInterval { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}", self.start)?; @@ -267,6 +349,12 @@ impl Display for GenomeLocEdit { } } +impl<'a> Display for NoRef<'a, GenomeLocEdit> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}{}", self.inner().loc, NoRef(&self.inner().edit)) + } +} + impl Display for GenomeInterval { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self.start { @@ -289,6 +377,12 @@ impl Display for MtLocEdit { } } +impl<'a> Display for NoRef<'a, MtLocEdit> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}{}", self.inner().loc, NoRef(&self.inner().edit)) + } +} + impl Display for MtInterval { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self.start { @@ -378,6 +472,79 @@ impl Display for HgvsVariant { } } +impl<'a> Display for NoRef<'a, HgvsVariant> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + NoRef(HgvsVariant::CdsVariant { + accession, + gene_symbol, + loc_edit, + }) => { + write!(f, "{accession}")?; + if let Some(gene_symbol) = gene_symbol { + write!(f, "({gene_symbol})")?; + } + write!(f, ":c.{}", NoRef(loc_edit)) + } + NoRef(HgvsVariant::GenomeVariant { + accession, + gene_symbol, + loc_edit, + }) => { + write!(f, "{accession}")?; + if let Some(gene_symbol) = gene_symbol { + write!(f, "({gene_symbol})")?; + } + write!(f, ":g.{}", NoRef(loc_edit)) + } + NoRef(HgvsVariant::MtVariant { + accession, + gene_symbol, + loc_edit, + }) => { + write!(f, "{accession}")?; + if let Some(gene_symbol) = gene_symbol { + write!(f, "({gene_symbol})")?; + } + write!(f, ":m.{}", NoRef(loc_edit)) + } + NoRef(HgvsVariant::TxVariant { + accession, + gene_symbol, + loc_edit, + }) => { + write!(f, "{accession}")?; + if let Some(gene_symbol) = gene_symbol { + write!(f, "({gene_symbol})")?; + } + write!(f, ":n.{}", NoRef(loc_edit)) + } + NoRef(HgvsVariant::ProtVariant { + accession, + gene_symbol, + loc_edit, + }) => { + write!(f, "{accession}")?; + if let Some(gene_symbol) = gene_symbol { + write!(f, "({gene_symbol})")?; + } + write!(f, ":p.{}", NoRef(loc_edit)) + } + NoRef(HgvsVariant::RnaVariant { + accession, + gene_symbol, + loc_edit, + }) => { + write!(f, "{accession}")?; + if let Some(gene_symbol) = gene_symbol { + write!(f, "({gene_symbol})")?; + } + write!(f, ":r.{}", NoRef(loc_edit)) + } + } + } +} + #[cfg(test)] mod test { use std::{ diff --git a/src/parser/ds.rs b/src/parser/ds.rs index ba6a0df..2b5f830 100644 --- a/src/parser/ds.rs +++ b/src/parser/ds.rs @@ -1,5 +1,9 @@ //! Data structures for representing HGVS variant descriptions. +use std::ops::{Deref, Range}; + +use log::warn; + /// Expression of "maybe uncertain". #[derive(Clone, Debug, PartialEq)] pub enum Mu { @@ -18,6 +22,13 @@ impl Mu { } } + pub fn is_certain(&self) -> bool { + match &self { + Mu::Certain(_) => true, + Mu::Uncertain(_) => false, + } + } + pub fn unwrap(self) -> T { match self { Mu::Certain(value) => value, @@ -31,6 +42,13 @@ impl Mu { Mu::Uncertain(value) => value, } } + + pub fn inner_mut(&mut self) -> &mut T { + match self { + Mu::Certain(value) => value, + Mu::Uncertain(value) => value, + } + } } /// Representation of gene symbol, e.g., `TTN` or `Ttn`. @@ -39,6 +57,26 @@ pub struct GeneSymbol { pub value: String, } +impl GeneSymbol { + pub fn new(value: &str) -> Self { + Self { + value: value.to_string(), + } + } + + pub fn from(value: String) -> Self { + Self { value } + } +} + +impl Deref for GeneSymbol { + type Target = String; + + fn deref(&self) -> &Self::Target { + &self.value + } +} + /// Edit of nucleic acids. #[derive(Clone, Debug, PartialEq)] pub enum NaEdit { @@ -64,6 +102,49 @@ pub enum NaEdit { InvNum { count: i32 }, } +impl NaEdit { + /// Return whether the reference equals the given value. + pub fn reference_equals(&self, value: &str) -> bool { + match self { + NaEdit::RefAlt { reference, .. } + | NaEdit::DelRef { reference } + | NaEdit::Dup { reference } + | NaEdit::InvRef { reference } => reference == value, + _ => false, + } + } + + /// Return an updated `NaEdit` that has the reference replaced with the given sequence. + pub fn with_reference(self, reference: String) -> Self { + match self { + NaEdit::RefAlt { alternative, .. } | NaEdit::NumAlt { alternative, .. } => { + // let (_, reference, alternative) = trim_common_suffixes(&reference, &alternative); + // let (_, reference, alternative) = trim_common_prefixes(&reference, &alternative); + if reference == alternative { + NaEdit::RefAlt { + reference: "".to_string(), + alternative: "".to_string(), + } + } else { + NaEdit::RefAlt { + reference, + alternative, + } + } + } + NaEdit::DelRef { .. } => NaEdit::DelRef { reference }, + NaEdit::DelNum { .. } => NaEdit::DelRef { reference }, + NaEdit::Ins { alternative } => { + warn!("Calling with_reference() on NaEdit::Ins"); + NaEdit::Ins { alternative } + } + NaEdit::Dup { .. } => NaEdit::Dup { reference }, + NaEdit::InvRef { .. } => NaEdit::InvRef { reference }, + NaEdit::InvNum { .. } => NaEdit::InvRef { reference }, + } + } +} + /// Uncertain change through extension. #[derive(Clone, Debug, PartialEq)] pub enum UncertainLengthChange { @@ -78,6 +159,26 @@ pub struct Accession { pub value: String, } +impl lazy_static::__Deref for Accession { + type Target = String; + + fn deref(&self) -> &Self::Target { + &self.value + } +} + +impl Accession { + pub fn new(value: &str) -> Self { + Self { + value: value.to_string(), + } + } + + pub fn from(value: String) -> Self { + Self { value } + } +} + /// Protein edit with interval end edit. #[derive(Clone, Debug, PartialEq)] pub enum ProteinEdit { @@ -154,6 +255,185 @@ pub enum HgvsVariant { }, } +impl HgvsVariant { + // Replace reference sequence. + pub fn with_reference(self, value: String) -> Self { + match self { + HgvsVariant::CdsVariant { + accession, + gene_symbol, + loc_edit, + } => HgvsVariant::CdsVariant { + accession, + gene_symbol, + loc_edit: loc_edit.with_reference(value), + }, + HgvsVariant::GenomeVariant { + accession, + gene_symbol, + loc_edit, + } => HgvsVariant::GenomeVariant { + accession, + gene_symbol, + loc_edit: loc_edit.with_reference(value), + }, + HgvsVariant::MtVariant { + accession, + gene_symbol, + loc_edit, + } => HgvsVariant::MtVariant { + accession, + gene_symbol, + loc_edit: loc_edit.with_reference(value), + }, + HgvsVariant::TxVariant { + accession, + gene_symbol, + loc_edit, + } => HgvsVariant::TxVariant { + accession, + gene_symbol, + loc_edit: loc_edit.with_reference(value), + }, + HgvsVariant::ProtVariant { + accession, + gene_symbol, + loc_edit, + } => { + warn!("Calling with_reference on ProtVariant"); + HgvsVariant::ProtVariant { + accession, + gene_symbol, + loc_edit, + } + } + HgvsVariant::RnaVariant { + accession, + gene_symbol, + loc_edit, + } => HgvsVariant::RnaVariant { + accession, + gene_symbol, + loc_edit: loc_edit.with_reference(value), + }, + } + } + + /// Return the accession. + pub fn accession(&self) -> &Accession { + match self { + HgvsVariant::CdsVariant { accession, .. } => accession, + HgvsVariant::GenomeVariant { accession, .. } => accession, + HgvsVariant::MtVariant { accession, .. } => accession, + HgvsVariant::TxVariant { accession, .. } => accession, + HgvsVariant::ProtVariant { accession, .. } => accession, + HgvsVariant::RnaVariant { accession, .. } => accession, + } + } + + /// Return the 0-based range of the location, possibly wrapped into `Mu` + pub fn mu_loc_range(&self) -> Option>> { + match self { + HgvsVariant::CdsVariant { loc_edit, .. } => loc_edit + .loc + .inner() + .clone() + .try_into() + .ok() + .map(|l| Mu::from(l, loc_edit.loc.is_certain())), + HgvsVariant::GenomeVariant { loc_edit, .. } => loc_edit + .loc + .inner() + .clone() + .try_into() + .ok() + .map(|l| Mu::from(l, loc_edit.loc.is_certain())), + HgvsVariant::MtVariant { loc_edit, .. } => loc_edit + .loc + .inner() + .clone() + .try_into() + .ok() + .map(|l| Mu::from(l, loc_edit.loc.is_certain())), + HgvsVariant::TxVariant { loc_edit, .. } => loc_edit + .loc + .inner() + .clone() + .try_into() + .ok() + .map(|l| Mu::from(l, loc_edit.loc.is_certain())), + HgvsVariant::RnaVariant { loc_edit, .. } => loc_edit + .loc + .inner() + .clone() + .try_into() + .ok() + .map(|l| Mu::from(l, loc_edit.loc.is_certain())), + HgvsVariant::ProtVariant { + loc_edit: ProtLocEdit::Ordinary { loc, .. }, + .. + } => Some(Mu::from(loc.inner().clone().into(), loc.is_certain())), + _ => None, + } + } + + pub fn loc_range(&self) -> Option> { + self.mu_loc_range().map(|l| l.inner().clone()) + } + + /// Return the `NaEdit` wrapped in `Mu`, if any. + pub fn mu_na_edit(&self) -> Option<&Mu> { + match self { + HgvsVariant::CdsVariant { loc_edit, .. } => Some(&loc_edit.edit), + HgvsVariant::GenomeVariant { loc_edit, .. } => Some(&loc_edit.edit), + HgvsVariant::MtVariant { loc_edit, .. } => Some(&loc_edit.edit), + HgvsVariant::TxVariant { loc_edit, .. } => Some(&loc_edit.edit), + HgvsVariant::RnaVariant { loc_edit, .. } => Some(&loc_edit.edit), + _ => None, + } + } + + /// Return the `NaEdit` if any. + pub fn na_edit(&self) -> Option<&NaEdit> { + self.mu_na_edit().map(|e| e.inner()) + } + + /// Return the `ProtLocEdit` if any. + pub fn mu_prot_edit(&self) -> Option<&Mu> { + match self { + HgvsVariant::ProtVariant { + loc_edit: ProtLocEdit::Ordinary { edit, .. }, + .. + } => Some(edit), + _ => None, + } + } + + /// Return the `ProtLocEdit` wrapped in `Mu`, if any. + pub fn prot_edit(&self) -> Option<&ProteinEdit> { + self.mu_prot_edit().map(|e| e.inner()) + } + + /// Return whether start or end position is intronic (offset != 0). + pub fn spans_intron(&self) -> bool { + match self { + HgvsVariant::CdsVariant { loc_edit, .. } => { + loc_edit.loc.inner().start.offset.unwrap_or_default() > 0 + || loc_edit.loc.inner().end.offset.unwrap_or_default() > 0 + } + HgvsVariant::TxVariant { loc_edit, .. } => { + loc_edit.loc.inner().start.offset.unwrap_or_default() > 0 + || loc_edit.loc.inner().end.offset.unwrap_or_default() > 0 + } + HgvsVariant::RnaVariant { loc_edit, .. } => { + loc_edit.loc.inner().start.offset.unwrap_or_default() > 0 + || loc_edit.loc.inner().end.offset.unwrap_or_default() > 0 + } + _ => false, + } + } +} + /// Coding sequence location with edit. #[derive(Clone, Debug, PartialEq)] pub struct CdsLocEdit { @@ -163,6 +443,19 @@ pub struct CdsLocEdit { pub edit: Mu, } +impl CdsLocEdit { + /// Return the LocEdit with the reference replaced by `reference`. + fn with_reference(self, reference: String) -> Self { + CdsLocEdit { + loc: self.loc, + edit: match self.edit { + Mu::Certain(edit) => Mu::Certain(edit.with_reference(reference)), + Mu::Uncertain(edit) => Mu::Uncertain(edit.with_reference(reference)), + }, + } + } +} + /// CDS position interval. #[derive(Clone, Debug, PartialEq)] pub struct CdsInterval { @@ -172,6 +465,29 @@ pub struct CdsInterval { pub end: CdsPos, } +impl TryFrom for Range { + type Error = anyhow::Error; + + /// The CDS interval will be converted from 1-based inclusive coordinates + /// `[start, end]` to 0-based, half-open Rust range `[start - 1, end)`. + fn try_from(value: CdsInterval) -> Result { + if value.start.offset.is_some() || value.end.offset.is_some() { + warn!("Converting interval {:?} with offset to range!", &value); + } + if value.start.cds_from != value.end.cds_from { + Err(anyhow::anyhow!( + "Conversion of interval with different offsets (CDS start/end) is ill-defined" + )) + } else { + Ok(if value.start.base > 0 { + (value.start.base - 1)..value.end.base + } else { + value.start.base..(value.end.base + 1) + }) + } + } +} + /// Specifies whether the CDS position is relative to the CDS start or /// CDS end. #[derive(Clone, Debug, PartialEq)] @@ -200,6 +516,19 @@ pub struct GenomeLocEdit { pub edit: Mu, } +impl GenomeLocEdit { + /// Return the LocEdit with the reference replaced by `reference`. + fn with_reference(self, reference: String) -> Self { + GenomeLocEdit { + loc: self.loc, + edit: match self.edit { + Mu::Certain(edit) => Mu::Certain(edit.with_reference(reference)), + Mu::Uncertain(edit) => Mu::Uncertain(edit.with_reference(reference)), + }, + } + } +} + /// Genome position interval. #[derive(Clone, Debug, PartialEq)] pub struct GenomeInterval { @@ -209,6 +538,23 @@ pub struct GenomeInterval { pub end: Option, } +impl TryInto> for GenomeInterval { + type Error = anyhow::Error; + + /// The genome interval will be converted from 1-based inclusive coordinates + /// `[start, end]` to 0-based, half-open Rust range `[start - 1, end)`. + fn try_into(self) -> Result, Self::Error> { + if let (Some(start), Some(end)) = (self.start, self.end) { + Ok((start - 1)..end) + } else { + Err(anyhow::anyhow!( + "Cannot convert interval with None position into range: {:?}", + self + )) + } + } +} + /// Mitochondrial sequence location with edit. #[derive(Clone, Debug, PartialEq)] pub struct MtLocEdit { @@ -218,6 +564,18 @@ pub struct MtLocEdit { pub edit: Mu, } +impl MtLocEdit { + /// Return the LocEdit with the reference replaced by `reference`. + fn with_reference(self, reference: String) -> Self { + MtLocEdit { + loc: self.loc, + edit: match self.edit { + Mu::Certain(edit) => Mu::Certain(edit.with_reference(reference)), + Mu::Uncertain(edit) => Mu::Uncertain(edit.with_reference(reference)), + }, + } + } +} /// Mitochondrial position interval. #[derive(Clone, Debug, PartialEq)] pub struct MtInterval { @@ -227,6 +585,23 @@ pub struct MtInterval { pub end: Option, } +impl TryInto> for MtInterval { + type Error = anyhow::Error; + + /// The MT interval will be converted from 1-based inclusive coordinates + /// `[start, end]` to 0-based, half-open Rust range `[start - 1, end)`. + fn try_into(self) -> Result, Self::Error> { + if let (Some(start), Some(end)) = (self.start, self.end) { + Ok((start - 1)..end) + } else { + Err(anyhow::anyhow!( + "Cannot convert interval with None position into range: {:?}", + self + )) + } + } +} + /// Transcript sequence location with edit. #[derive(Clone, Debug, PartialEq)] pub struct TxLocEdit { @@ -236,6 +611,19 @@ pub struct TxLocEdit { pub edit: Mu, } +impl TxLocEdit { + /// Return the LocEdit with the reference replaced by `reference`. + fn with_reference(self, reference: String) -> Self { + TxLocEdit { + loc: self.loc, + edit: match self.edit { + Mu::Certain(edit) => Mu::Certain(edit.with_reference(reference)), + Mu::Uncertain(edit) => Mu::Uncertain(edit.with_reference(reference)), + }, + } + } +} + /// Transcript position interval. #[derive(Clone, Debug, PartialEq)] pub struct TxInterval { @@ -245,6 +633,21 @@ pub struct TxInterval { pub end: TxPos, } +impl From for Range { + /// The transcript interval will be converted from 1-based inclusive coordinates + /// `[start, end]` to 0-based, half-open Rust range `[start - 1, end)`. + fn from(val: TxInterval) -> Self { + if val.start.offset.is_some() || val.end.offset.is_some() { + warn!("Converting interval {:?} with offset to range!", &val); + } + if val.start.base > 0 { + (val.start.base - 1)..val.end.base + } else { + val.start.base..(val.end.base + 1) + } + } +} + /// Transcript position. #[derive(Clone, Debug, PartialEq)] pub struct TxPos { @@ -263,6 +666,18 @@ pub struct RnaLocEdit { pub edit: Mu, } +impl RnaLocEdit { + /// Return the LocEdit with the reference replaced by `reference`. + fn with_reference(self, reference: String) -> Self { + RnaLocEdit { + loc: self.loc, + edit: match self.edit { + Mu::Certain(edit) => Mu::Certain(edit.with_reference(reference)), + Mu::Uncertain(edit) => Mu::Uncertain(edit.with_reference(reference)), + }, + } + } +} /// RNA position interval. #[derive(Clone, Debug, PartialEq)] pub struct RnaInterval { @@ -272,6 +687,21 @@ pub struct RnaInterval { pub end: RnaPos, } +impl From for Range { + /// The RNA interval will be converted from 1-based inclusive coordinates + /// `[start, end]` to 0-based, half-open Rust range `[start - 1, end)`. + fn from(val: RnaInterval) -> Self { + if val.start.offset.is_some() || val.end.offset.is_some() { + warn!("Converting interval {:?} with offset to range!", &val); + } + if val.start.base > 0 { + (val.start.base - 1)..val.end.base + } else { + val.start.base..(val.end.base + 1) + } + } +} + /// RNA position. #[derive(Clone, Debug, PartialEq)] pub struct RnaPos { @@ -307,6 +737,16 @@ pub struct ProtInterval { pub end: ProtPos, } +impl From for Range { + fn from(val: ProtInterval) -> Self { + if val.start.number > 0 { + (val.start.number - 1)..val.end.number + } else { + val.start.number..(val.end.number + 1) + } + } +} + /// Protein position. #[derive(Clone, Debug, PartialEq)] pub struct ProtPos { diff --git a/src/parser/impl_ops.rs b/src/parser/impl_ops.rs deleted file mode 100644 index 9e08450..0000000 --- a/src/parser/impl_ops.rs +++ /dev/null @@ -1,16 +0,0 @@ -//! Implementation of operations on the data structures. - -use crate::data::interface::Provider; - -use super::HgvsVariant; - -impl HgvsVariant { - /// Fill reference bases using the given data provider. - /// - /// # Args - /// - /// * `provider` -- The `Provider` to use for fetching reference bases. - pub fn fill_ref(&self, _provider: &dyn Provider) -> Result { - todo!() - } -} diff --git a/src/parser/impl_validate.rs b/src/parser/impl_validate.rs new file mode 100644 index 0000000..09db380 --- /dev/null +++ b/src/parser/impl_validate.rs @@ -0,0 +1,232 @@ +//! Provide implementation of validation to data structures. + +use std::ops::Range; + +use crate::validator::Validateable; + +use super::{ + CdsInterval, CdsLocEdit, GenomeInterval, GenomeLocEdit, HgvsVariant, MtLocEdit, NaEdit, + ProtLocEdit, RnaLocEdit, TxLocEdit, +}; + +impl Validateable for NaEdit { + fn validate(&self) -> Result<(), anyhow::Error> { + match &self { + NaEdit::RefAlt { + reference, + alternative, + } => { + if reference.is_empty() && alternative.is_empty() { + Err(anyhow::anyhow!("Reference or alternative must be non-empty; use DelRef or Ins otherwise ({}/{})", reference, alternative)) + } else { + Ok(()) + } + } + NaEdit::NumAlt { count, alternative } => { + if *count < 1 { + Err(anyhow::anyhow!( + "Number of deleted bases must be >=1 but was {}", + count + )) + } else if alternative.is_empty() { + Err(anyhow::anyhow!( + "Number of alternative bases must non-empty", + )) + } else { + Ok(()) + } + } + NaEdit::DelRef { reference: _ } => Ok(()), + NaEdit::DelNum { count } => { + if *count < 1 { + Err(anyhow::anyhow!( + "Number of deleted bases must be >=1 but was {}", + count + )) + } else { + Ok(()) + } + } + NaEdit::Ins { alternative: _ } => Ok(()), + NaEdit::Dup { reference: _ } => Ok(()), + NaEdit::InvRef { reference: _ } => Ok(()), + NaEdit::InvNum { count } => { + if *count < 1 { + Err(anyhow::anyhow!( + "Number of inverted bases must be >=1 but was {}", + count + )) + } else { + Ok(()) + } + } + } + } +} + +impl Validateable for HgvsVariant { + fn validate(&self) -> Result<(), anyhow::Error> { + // NB: we only need to validate `self.loc_edit`. The cases that the Python library + // considers are fended off by the Rust type system. + match &self { + HgvsVariant::CdsVariant { loc_edit, .. } => loc_edit.validate(), + HgvsVariant::GenomeVariant { loc_edit, .. } => loc_edit.validate(), + HgvsVariant::MtVariant { loc_edit, .. } => loc_edit.validate(), + HgvsVariant::TxVariant { loc_edit, .. } => loc_edit.validate(), + HgvsVariant::ProtVariant { loc_edit, .. } => loc_edit.validate(), + HgvsVariant::RnaVariant { loc_edit, .. } => loc_edit.validate(), + } + } +} + +impl Validateable for CdsLocEdit { + fn validate(&self) -> Result<(), anyhow::Error> { + let loc = self.loc.inner(); + loc.validate()?; + + let maybe_range: Result, _> = loc.clone().try_into(); + let range = if let Ok(range) = maybe_range { + range + } else { + log::trace!( + "Skipping CDS location because loc cannot be converted to range: {:?}", + loc + ); + return Ok(()); + }; + + match self.edit.inner() { + NaEdit::RefAlt { reference, .. } + | NaEdit::DelRef { reference } + | NaEdit::Dup { reference } + | NaEdit::InvRef { reference } => { + if !reference.is_empty() && range.len() != reference.len() { + Err(anyhow::anyhow!( + "Length implied by coordinates must equal reference sequence length ({})", + &self + )) + } else { + Ok(()) + } + } + NaEdit::Ins { .. } => { + if range.len() != 2 { + Err(anyhow::anyhow!( + "Locus length must be 1 for insertions ({}, {:?})", + &self, + &range + )) + } else { + Ok(()) + } + } + NaEdit::DelNum { count } | NaEdit::NumAlt { count, .. } | NaEdit::InvNum { count } => { + if range.len() as i32 != *count { + Err(anyhow::anyhow!( + "Length implied by coordinates must equal count ({})", + &self + )) + } else { + Ok(()) + } + } + } + } +} + +impl Validateable for CdsInterval { + fn validate(&self) -> Result<(), anyhow::Error> { + Ok(()) // TODO + } +} + +impl Validateable for GenomeLocEdit { + fn validate(&self) -> Result<(), anyhow::Error> { + self.loc.inner().validate()?; + self.edit.inner().validate() + } +} + +impl Validateable for GenomeInterval { + fn validate(&self) -> Result<(), anyhow::Error> { + if let Some(start) = self.start { + if start < 1 { + return Err(anyhow::anyhow!("Start must be >= 1 but was {}", start)); + } + } + if let Some(end) = self.end { + if end < 1 { + return Err(anyhow::anyhow!("End must be >= 1 but was {}", end)); + } + } + if let (Some(start), Some(end)) = (self.start, self.end) { + if start > end { + return Err(anyhow::anyhow!( + "Start must be <= end position but was {} > {}", + start, + end + )); + } + } + + Ok(()) + } +} + +impl Validateable for MtLocEdit { + fn validate(&self) -> Result<(), anyhow::Error> { + Ok(()) // TODO + } +} + +impl Validateable for TxLocEdit { + fn validate(&self) -> Result<(), anyhow::Error> { + Ok(()) // TODO + } +} + +impl Validateable for RnaLocEdit { + fn validate(&self) -> Result<(), anyhow::Error> { + Ok(()) // TODO + } +} + +impl Validateable for ProtLocEdit { + fn validate(&self) -> Result<(), anyhow::Error> { + Ok(()) // TODO + } +} + +#[cfg(test)] +mod test { + use crate::{parser::GenomeInterval, validator::Validateable}; + + #[test] + fn validate_genomeinterval() -> Result<(), anyhow::Error> { + let g_interval = GenomeInterval { + start: Some(1), + end: Some(2), + }; + assert!(g_interval.validate().is_ok()); + + let g_interval = GenomeInterval { + start: Some(-1), + end: Some(2), + }; + assert!(g_interval.validate().is_err()); + + let g_interval = GenomeInterval { + start: Some(1), + end: Some(-2), + }; + assert!(g_interval.validate().is_err()); + + let g_interval = GenomeInterval { + start: Some(2), + end: Some(1), + }; + assert!(g_interval.validate().is_err()); + + Ok(()) + } +} diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 5bba051..f25dfc1 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -6,8 +6,8 @@ mod display; mod ds; -mod impl_ops; mod impl_parse; +mod impl_validate; mod parse_funcs; use std::str::FromStr; diff --git a/src/utils.rs b/src/utils.rs new file mode 100644 index 0000000..113e427 --- /dev/null +++ b/src/utils.rs @@ -0,0 +1,123 @@ +pub(crate) fn trim_common_prefixes(reference: &str, alternative: &str) -> (usize, String, String) { + if reference.is_empty() || alternative.is_empty() { + return (0, reference.to_string(), alternative.to_string()); + } + + let mut trim = 0; + while trim < reference.len() && trim < alternative.len() { + if reference.chars().nth(trim) != alternative.chars().nth(trim) { + break; + } + + trim += 1; + } + + ( + trim, + reference[trim..].to_string(), + alternative[trim..].to_string(), + ) +} + +pub(crate) fn trim_common_suffixes(reference: &str, alternative: &str) -> (usize, String, String) { + if reference.is_empty() || alternative.is_empty() { + return (0, reference.to_string(), alternative.to_string()); + } + + let mut trim = 0; + let mut i_r = reference.len(); + let mut i_a = alternative.len(); + let mut pad = 0; + while trim < reference.len() && trim < alternative.len() { + trim += 1; + assert!(i_r > 0); + assert!(i_a > 0); + i_r -= 1; + i_a -= 1; + + if reference.chars().nth(i_r) != alternative.chars().nth(i_a) { + pad = 1; + break; + } + } + + ( + trim - pad, + reference[..(i_r + pad)].to_string(), + alternative[..(i_a + pad)].to_string(), + ) +} + +/// Reverse complementing shortcut. +pub(crate) fn revcomp(seq: &str) -> String { + std::str::from_utf8(&bio::alphabets::dna::revcomp(seq.as_bytes())) + .unwrap() + .to_string() +} + +#[cfg(test)] +mod test { + use pretty_assertions::assert_eq; + + use crate::utils::{revcomp, trim_common_prefixes, trim_common_suffixes}; + + #[test] + fn suffix_trimming() { + assert_eq!( + trim_common_suffixes("", ""), + (0, "".to_string(), "".to_string()) + ); + assert_eq!( + trim_common_suffixes("", "C"), + (0, "".to_string(), "C".to_string()) + ); + assert_eq!( + trim_common_suffixes("C", ""), + (0, "C".to_string(), "".to_string()) + ); + assert_eq!( + trim_common_suffixes("A", "AA"), + (1, "".to_string(), "A".to_string()) + ); + assert_eq!( + trim_common_suffixes("AT", "AG"), + (0, "AT".to_string(), "AG".to_string()) + ); + assert_eq!( + trim_common_suffixes("ATCG", "AGCG"), + (2, "AT".to_string(), "AG".to_string()) + ); + } + + #[test] + fn prefix_trimming() { + assert_eq!( + trim_common_prefixes("", ""), + (0, "".to_string(), "".to_string()) + ); + assert_eq!( + trim_common_prefixes("", "C"), + (0, "".to_string(), "C".to_string()) + ); + assert_eq!( + trim_common_prefixes("C", ""), + (0, "C".to_string(), "".to_string()) + ); + assert_eq!( + trim_common_prefixes("TA", "GA"), + (0, "TA".to_string(), "GA".to_string()) + ); + assert_eq!( + trim_common_prefixes("CGTA", "CGGA"), + (2, "TA".to_string(), "GA".to_string()) + ); + } + + #[test] + fn revcomp_cases() { + assert_eq!(revcomp(""), ""); + assert_eq!(revcomp("A"), "T"); + assert_eq!(revcomp("AG"), "CT"); + assert_eq!(revcomp("CGAG"), "CTCG"); + } +} diff --git a/src/validator/mod.rs b/src/validator/mod.rs index 895d52b..b4245d0 100644 --- a/src/validator/mod.rs +++ b/src/validator/mod.rs @@ -2,51 +2,83 @@ use std::rc::Rc; -use crate::{data::interface::Provider, parser::HgvsVariant}; +use log::{error, warn}; +use crate::{ + data::interface::Provider, + mapper::{variant::Config, variant::Mapper}, + parser::HgvsVariant, +}; + +/// Trait for validating of variants, locations etc. +pub trait Validateable { + fn validate(&self) -> Result<(), anyhow::Error>; +} + +/// Validation level specification. #[derive(Debug, PartialEq, Clone, Copy)] pub enum ValidationLevel { + /// No validation. Null, + /// Only inspect the variant description itself. Intrinsic, - Extrinsic, + /// Full validation including checks based on sequence and intrinsics. + Full, } impl ValidationLevel { - pub fn validator(&self, strict: bool, provider: Rc) -> Box { + pub fn validator(&self, strict: bool, provider: Rc) -> Rc { match self { - ValidationLevel::Null => Box::new(NullValidator::new(strict)), - ValidationLevel::Intrinsic => Box::new(IntrinsicValidator::new(strict)), - ValidationLevel::Extrinsic => Box::new(ExtrinsicValidator::new(strict, provider)), + ValidationLevel::Null => Rc::new(NullValidator::new()), + ValidationLevel::Intrinsic => Rc::new(IntrinsicValidator::new(strict)), + ValidationLevel::Full => Rc::new(FullValidator::new(strict, provider)), } } } +/// Trait for validators. pub trait Validator { + /// Return whether validation is strict. + /// + /// Validation is strict if errors cause `Err` results rather than just logging a warning. fn is_strict(&self) -> bool; + /// Validate the given variant. + /// + /// Depending on the configuration and implementation of the validator, an `Err` will be + /// returned or only a warning will be logged. fn validate(&self, var: &HgvsVariant) -> Result<(), anyhow::Error>; } -pub struct NullValidator { - strict: bool, -} +/// A validator that performs no validation. +pub struct NullValidator {} impl NullValidator { - pub fn new(strict: bool) -> Self { - Self { strict } + pub fn new() -> Self { + Self {} + } +} + +impl Default for NullValidator { + fn default() -> Self { + Self::new() } } impl Validator for NullValidator { fn is_strict(&self) -> bool { - self.strict + false } fn validate(&self, _var: &HgvsVariant) -> Result<(), anyhow::Error> { - todo!() + Ok(()) } } +/// A validator that only performs intrinsic validation. +/// +/// This means that only the variant description itself is checked without considering the +/// actual sequence. pub struct IntrinsicValidator { strict: bool, } @@ -62,19 +94,42 @@ impl Validator for IntrinsicValidator { self.strict } - fn validate(&self, _var: &HgvsVariant) -> Result<(), anyhow::Error> { - todo!() + fn validate(&self, var: &HgvsVariant) -> Result<(), anyhow::Error> { + let res = var.validate(); + match (&res, self.is_strict()) { + (Ok(_), _) => Ok(()), + (Err(_), false) => { + warn!("Validation of {} failed: {:?}", var, res); + Ok(()) + } + (Err(_), true) => { + error!("Validation of {} failed: {:?}", var, res); + res + } + } } } +/// Attempts to determine if the HGVS name validates against external data sources pub struct ExtrinsicValidator { strict: bool, - provider: Rc, + #[allow(dead_code)] + mapper: Mapper, } impl ExtrinsicValidator { pub fn new(strict: bool, provider: Rc) -> Self { - Self { strict, provider } + let config = Config { + replace_reference: false, + strict_validation: false, + prevalidation_level: ValidationLevel::Null, + add_gene_symbol: false, + strict_bounds: true, + }; + Self { + strict, + mapper: Mapper::new(&config, provider), + } } } @@ -83,7 +138,89 @@ impl Validator for ExtrinsicValidator { self.strict } - fn validate(&self, _var: &HgvsVariant) -> Result<(), anyhow::Error> { - todo!() + fn validate(&self, var: &HgvsVariant) -> Result<(), anyhow::Error> { + // Check transcripts bounds + match var { + HgvsVariant::CdsVariant { .. } | HgvsVariant::TxVariant { .. } => { + let res = self.check_tx_bound(var); + if res.is_err() { + if self.is_strict() { + error!("Validation of {} failed: {:?}", var, res); + return res; + } else { + warn!("Validation of {} failed: {:?}", var, res); + } + } + } + _ => {} + } + + // Check CDS bounds + { + let res = self.check_cds_bound(var); + if res.is_err() { + if self.is_strict() { + error!("Validation of {} failed: {:?}", var, res); + return res; + } else { + warn!("Validation of {} failed: {:?}", var, res); + } + } + } + + // Check reference. + { + let res = self.check_ref(var); + if res.is_err() { + if self.is_strict() { + error!("Validation of {} failed: {:?}", var, res); + return res; + } else { + warn!("Validation of {} failed: {:?}", var, res); + } + } + } + + Ok(()) + } +} + +impl ExtrinsicValidator { + fn check_tx_bound(&self, _var: &HgvsVariant) -> Result<(), anyhow::Error> { + Ok(()) // TODO + } + + fn check_cds_bound(&self, _var: &HgvsVariant) -> Result<(), anyhow::Error> { + Ok(()) // TODO + } + + fn check_ref(&self, _var: &HgvsVariant) -> Result<(), anyhow::Error> { + Ok(()) // TODO + } +} + +/// Full validator performing both intrinsic and extrinsic validation. +pub struct FullValidator { + intrinsic: IntrinsicValidator, + extrinsic: ExtrinsicValidator, +} + +impl FullValidator { + pub fn new(strict: bool, provider: Rc) -> Self { + Self { + intrinsic: IntrinsicValidator::new(strict), + extrinsic: ExtrinsicValidator::new(strict, provider), + } + } +} + +impl Validator for FullValidator { + fn is_strict(&self) -> bool { + self.intrinsic.is_strict() + } + + fn validate(&self, var: &HgvsVariant) -> Result<(), anyhow::Error> { + self.intrinsic.validate(var)?; + self.extrinsic.validate(var) } } diff --git a/tests/data/data/bootstrap.sh b/tests/data/data/bootstrap.sh index 2bc0cab..828cc87 100644 --- a/tests/data/data/bootstrap.sh +++ b/tests/data/data/bootstrap.sh @@ -64,7 +64,31 @@ VERSION=$2 DST=$SCRIPT_DIR # The HGNC symbols of the genes to fetc. -GENES="OMA1 OPA1 LCE3C H2AW LCE2B PTH2 SRD5A2" +set +e +read -r -d '' GENES <