diff --git a/Cargo.toml b/Cargo.toml index 4ef66af..7500045 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,6 +5,7 @@ edition = "2021" [dependencies] anyhow = "1.0.69" +base16ct = "0.2.0" bio = "1.1.0" chrono = "0.4.23" enum-map = "2.4.2" @@ -12,7 +13,9 @@ flate2 = "1.0.25" lazy_static = "1.4.0" linked-hash-map = "0.5.6" log = "0.4.17" +md-5 = "0.10.5" nom = "7.1.3" +phf = { version = "0.11.1", features = ["macros"] } postgres = { version = "0.19.4", features = ["with-chrono-0_4"] } pretty_assertions = "1.3.0" regex = "1.7.1" @@ -21,5 +24,7 @@ serde = { version = "1.0.152", features = ["derive"] } serde_json = "1.0.93" [dev-dependencies] +csv = "1.2.0" env_logger = "0.10.0" +serde = { version = "1.0.152", features = ["derive"] } test-log = "0.2.11" diff --git a/src/data/interface.rs b/src/data/interface.rs index 93c657f..88405e1 100644 --- a/src/data/interface.rs +++ b/src/data/interface.rs @@ -232,6 +232,12 @@ pub trait Provider { end: Option, ) -> Result; + /// Returns a list of protein accessions for a given sequence. + /// + /// The list is guaranteed to contain at least one element with the MD5-based accession + /// (MD5_01234abc..def56789) at the end of the list. + fn get_acs_for_protein_seq(&self, seq: &str) -> Result, anyhow::Error>; + /// Return a list of transcripts that are similar to the given transcript, with relevant /// similarity criteria. /// diff --git a/src/data/uta.rs b/src/data/uta.rs index 4c66d0c..638dc12 100644 --- a/src/data/uta.rs +++ b/src/data/uta.rs @@ -7,6 +7,7 @@ use postgres::{Client, NoTls, Row}; use std::fmt::Debug; use std::sync::Mutex; +use crate::sequences::seq_md5; use crate::static_data::{Assembly, ASSEMBLY_INFOS}; use crate::data::{ @@ -276,6 +277,23 @@ impl ProviderInterface for Provider { Ok(seq[begin..end].into()) } + fn get_acs_for_protein_seq(&self, seq: &str) -> Result, anyhow::Error> { + let md5 = seq_md5(seq, true)?; + let sql = format!( + "SELECT ac FROM {}.seq_anno WHERE seq_id = $1", + self.config.db_schema + ); + + let mut result = Vec::new(); + for row in self.conn.lock().unwrap().query(&sql, &[&md5])? { + result.push(row.get(0)); + } + + // Add sentinel sequence. + result.push(format!("MD5_{}", &md5)); + Ok(result) + } + fn get_similar_transcripts( &self, tx_ac: &str, diff --git a/src/data/uta_sr.rs b/src/data/uta_sr.rs index c69c128..e837cbf 100644 --- a/src/data/uta_sr.rs +++ b/src/data/uta_sr.rs @@ -124,6 +124,10 @@ impl ProviderInterface for Provider { self.seqrepo.fetch_sequence_part(&aos, begin, end) } + fn get_acs_for_protein_seq(&self, seq: &str) -> Result, anyhow::Error> { + self.inner.get_acs_for_protein_seq(seq) + } + fn get_similar_transcripts( &self, tx_ac: &str, diff --git a/src/lib.rs b/src/lib.rs index a81b443..2b64079 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,6 +2,6 @@ pub mod data; pub mod mapper; pub mod normalizer; pub mod parser; +pub(crate) mod sequences; pub mod static_data; -pub(crate) mod utils; pub mod validator; diff --git a/src/mapper/alignment.rs b/src/mapper/alignment.rs index 77ab15e..b2487bb 100644 --- a/src/mapper/alignment.rs +++ b/src/mapper/alignment.rs @@ -484,15 +484,12 @@ impl Mapper { #[cfg(test)] mod test { - use std::{rc::Rc, str::FromStr}; + use std::str::FromStr; use pretty_assertions::assert_eq; use crate::{ - data::{ - interface::{Provider as Interface, TxExonsRecord}, - uta::{Config, Provider}, - }, + data::{interface::TxExonsRecord, uta_sr::test_helpers::build_provider}, parser::{CdsFrom, CdsInterval, CdsPos, GenomeInterval, Mu, TxInterval, TxPos}, }; @@ -563,21 +560,11 @@ mod test { assert_eq!(none_if_default(-1i32), Some(-1i32)); } - fn get_config() -> Config { - Config { - db_url: std::env::var("TEST_UTA_DATABASE_URL") - .expect("Environment variable TEST_UTA_DATABASE_URL undefined!"), - db_schema: std::env::var("TEST_UTA_DATABASE_SCHEMA") - .expect("Environment variable TEST_UTA_DATABASE_SCHEMA undefined!"), - } - } - #[test] fn construction() -> Result<(), anyhow::Error> { - let config = get_config(); - let provider = Provider::with_config(&config)?; + let provider = build_provider()?; - assert_eq!(provider.data_version(), config.db_schema); + assert_eq!(provider.data_version(), "uta_20210129"); assert_eq!(provider.schema_version(), "1.1"); Ok(()) @@ -585,8 +572,7 @@ mod test { #[test] fn failures() -> Result<(), anyhow::Error> { - let config = get_config(); - let provider = Rc::new(Provider::with_config(&config)?); + let provider = build_provider()?; // unknown sequences @@ -640,8 +626,7 @@ mod test { alt_ac: &str, cases: &Vec<(GenomeInterval, TxInterval, CdsInterval)>, ) -> Result<(), anyhow::Error> { - let config = get_config(); - let provider = Rc::new(Provider::with_config(&config)?); + let provider = build_provider()?; let mapper = Mapper::new(provider, tx_ac, alt_ac, "splign")?; for (g_interval, n_interval, c_interval) in cases { diff --git a/src/mapper/altseq.rs b/src/mapper/altseq.rs new file mode 100644 index 0000000..aa039a4 --- /dev/null +++ b/src/mapper/altseq.rs @@ -0,0 +1,1020 @@ +//! Code for building alternative sequence and convertion to HGVS.p. + +use std::{cmp::Ordering, rc::Rc}; + +use crate::{ + data::interface::Provider, + parser::{ + Accession, CdsFrom, HgvsVariant, Mu, NaEdit, ProtInterval, ProtLocEdit, ProtPos, + ProteinEdit, UncertainLengthChange, + }, + sequences::{revcomp, translate_cds, TranslationTable}, +}; + +#[derive(Debug, Clone)] +pub struct RefTranscriptData { + /// Transcript nucleotide sequence. + pub transcript_sequence: String, + /// Translated amino acid sequence. + pub aa_sequence: String, + /// 1-based CDS start position on transcript. + pub cds_start: i32, + /// 1-based CDS end position on transcript. + pub cds_stop: i32, + /// Accession of the protein or `MD5_${md5sum}`. + pub protein_accession: String, +} + +impl RefTranscriptData { + /// Construct new instance fetching data from the provider.. + /// + /// # Args + /// + /// * `provider` -- Data provider to query. + /// * `tx_ac` -- Transcript accession. + /// * `pro_ac` -- Protein accession. + pub fn new( + provider: Rc, + tx_ac: &str, + pro_ac: Option<&str>, + ) -> Result { + let tx_info = provider.as_ref().get_tx_identity_info(tx_ac)?; + let transcript_sequence = provider.as_ref().get_seq(tx_ac)?; + + // Use 1-based HGVS coordinates. + let cds_start = tx_info.cds_start_i + 1; + let cds_stop = tx_info.cds_end_i; + + // Coding sequences taht are not divisable by 3 are not yet supported. + let tx_seq_to_translate = + &transcript_sequence[((cds_start - 1) as usize)..(cds_stop as usize)]; + if tx_seq_to_translate.len() % 3 != 0 { + return Err(anyhow::anyhow!( + "Transcript {} is not supported because its sequence length of {} is not divible by 3.", + tx_ac, + tx_seq_to_translate.len())); + } + + let aa_sequence = + translate_cds(tx_seq_to_translate, true, "*", TranslationTable::Standard)?; + let protein_accession = if let Some(pro_ac) = pro_ac { + pro_ac.to_owned() + } else if let Some(pro_ac) = provider.as_ref().get_pro_ac_for_tx_ac(tx_ac)? { + pro_ac + } else { + // get_acs_for_protein_seq() will always return at least the MD5_ accession. + // + // NB: the following comment is from the original Python code. + // + // TODO: drop get_acs_for_protein_seq; use known mapping or digest (wo/pro ac inference) + provider + .as_ref() + .get_acs_for_protein_seq(&aa_sequence)? + .into_iter() + .next() + .unwrap() + }; + + Ok(Self { + transcript_sequence, + aa_sequence, + cds_start, + cds_stop, + protein_accession, + }) + } +} + +pub struct AltTranscriptData { + /// Transcript nucleotide sequence. + #[allow(dead_code)] + transcript_sequence: String, + /// 1-letter amino acid sequence. + aa_sequence: String, + /// 1-based CDS start position. + #[allow(dead_code)] + cds_start: i32, + /// 1-based CDS stop position. + #[allow(dead_code)] + cds_stop: i32, + /// Protein accession number, e.g., `"NP_999999.2"`. + #[allow(dead_code)] + protein_accession: String, + /// Whether this is a frameshift variant. + is_frameshift: bool, + /// 1-based AA start index for this variant. + variant_start_aa: Option, + /// Starting position (AA ref index) of the last framewshift which affects the rest of the + /// sequence, ie.e., not offset by subsequent frameshifts. + frameshift_start: Option, + /// Whether this is a substitution AA variant. + is_substitution: bool, + /// Whether variant is "?". + is_ambiguous: bool, +} + +impl AltTranscriptData { + /// Create a variant sequence using inputs from `VariantInserter`. + #[allow(clippy::too_many_arguments)] + pub fn new( + seq: &str, + cds_start: i32, + cds_stop: i32, + is_frameshift: bool, + variant_start_aa: Option, + protein_accession: &str, + is_substitution: bool, + is_ambiguous: bool, + ) -> Result { + let transcript_sequence = seq.to_owned(); + let aa_sequence = if !seq.is_empty() { + let seq_cds = &transcript_sequence[((cds_start - 1) as usize)..]; + let seq_aa = translate_cds(seq_cds, false, "X", TranslationTable::Standard)?; + let stop_pos = seq_aa[..((cds_stop - cds_start + 1) as usize / 3)] + .rfind('*') + .or_else(|| seq_aa.find('*')); + if let Some(stop_pos) = stop_pos { + seq_aa[..(stop_pos + 1)].to_owned() + } else { + seq_aa + } + } else { + "".to_owned() + }; + + Ok(Self { + transcript_sequence, + aa_sequence, + cds_start, + cds_stop, + protein_accession: protein_accession.to_owned(), + is_frameshift, + variant_start_aa, + frameshift_start: None, + is_substitution, + is_ambiguous, + }) + } +} + +/// Utility enum for locating variant in a transcript. +enum VariantLocation { + Exon, + Intron, + FivePrimeUtr, + ThreePrimeUtr, + WholeGene, +} + +/// Utility enum for edit type. +enum EditType { + NaRefAlt, + Dup, + Inv, + NotCds, + WholeGeneDeleted, +} + +/// Utility to insert an hgvs variant into a transcript sequence. +/// +/// Generates a record corresponding to the modified transcript sequence, along with annotations +/// for use in conversion to an hgvsp tag. Used in hgvsc to hgvsp conversion. +pub struct AltSeqBuilder { + /// HgvsVariant::CdsVariant to build the alternative sequence for. + pub var_c: HgvsVariant, + /// Information about the transcript reference. + pub reference_data: RefTranscriptData, + /// Whether the transcript reference amino acid sequence has multiple stop codons. + pub ref_has_multiple_stops: bool, +} + +impl AltSeqBuilder { + pub fn new(var_c: HgvsVariant, reference_data: RefTranscriptData) -> Self { + if !matches!(&var_c, HgvsVariant::CdsVariant { .. }) { + panic!("Must initialize with HgvsVariant::CdsVariant"); + } + let ref_has_multiple_stops = reference_data.aa_sequence.matches('*').count() > 1; + + Self { + var_c, + reference_data, + ref_has_multiple_stops, + } + } + + /// Given a variant and a sequence, incorporate the variant and return the new sequence. + /// + /// Data structure returned is analogous to the data structure used to return the variant + /// sequence, but with an additional parameter denoting the start of a frameshift that should + /// affect all bases downstream. + /// + /// # Returns + /// + /// Variant sequence data. + pub fn build_altseq(&self) -> Result, anyhow::Error> { + // NB: the following common is from the original Python code. + // Should loop over each allele rather than assume only 1 variant; return a list for now. + + let na_edit = self.var_c.na_edit().expect("Invalid CdsVariant"); + let edit_type = match self.get_variant_region() { + VariantLocation::Exon => match na_edit { + NaEdit::RefAlt { .. } + | NaEdit::NumAlt { .. } + | NaEdit::DelRef { .. } + | NaEdit::DelNum { .. } + | NaEdit::Ins { .. } => EditType::NaRefAlt, + NaEdit::Dup { .. } => EditType::Dup, + NaEdit::InvRef { .. } | NaEdit::InvNum { .. } => EditType::Inv, + }, + VariantLocation::Intron + // NB: the following comment is from the original Python code + // TODO: handle case where variatn introduces a `Met` (new start) + | VariantLocation::FivePrimeUtr + | VariantLocation::ThreePrimeUtr => EditType::NotCds, + VariantLocation::WholeGene => match na_edit { + NaEdit::DelRef {..} | + NaEdit::DelNum { .. } => EditType::WholeGeneDeleted, + NaEdit::Dup { .. } => { + log::warn!("Whole-gene duplication; consequence assumed to not affect protein product"); + EditType::NotCds + }, + NaEdit::InvRef { .. } | + NaEdit::InvNum { .. } => { + log::warn!("Whole-gene inversion; consequence assumed to not affected protein product"); + EditType::NotCds + }, + _ => panic!("Invalid combination of whole gene variant location and NaEdit {na_edit:?}"), + }, + }; + + // Get the start of the "terminal" frameshift (i.e. one never "cancelled out"). + let alt_data = match edit_type { + EditType::NaRefAlt => self.incorporate_delins(), + EditType::Dup => self.incorporate_dup(), + EditType::Inv => self.incorporate_inv(), + EditType::NotCds => self.create_alt_equals_ref_noncds(), + EditType::WholeGeneDeleted => self.create_no_protein(), + }?; + + let alt_data = self.get_frameshift_start(alt_data); + + Ok(vec![alt_data]) + } + + /// Categorize variant by location in transcript. + fn get_variant_region(&self) -> VariantLocation { + match &self.var_c { + HgvsVariant::CdsVariant { loc_edit, .. } => { + let loc = loc_edit.loc.inner(); + if loc.start.cds_from == CdsFrom::End && loc.end.cds_from == CdsFrom::End { + VariantLocation::ThreePrimeUtr + } else if loc.start.base < 0 && loc.end.base < 0 { + VariantLocation::FivePrimeUtr + } else if loc.start.base < 0 && loc.end.cds_from == CdsFrom::End { + VariantLocation::WholeGene + } else if loc.start.offset.is_some() || loc.end.offset.is_some() { + // Leave out anything intronic for now. + VariantLocation::Intron + } else { + // Anything else that contains an exon. + VariantLocation::Exon + } + } + _ => panic!("Must be CDS variant"), + } + } + + /// Helper to setup incorporate function. + /// + /// # Returns + /// + /// `(transcript sequence, cds start [1-based], cds stop [1-based], cds start index in + /// seq [inc, 0-based], cds end index in seq [excl, 0-based])` + fn setup_incorporate(&self) -> (String, i32, i32, usize, usize) { + let mut start_end = Vec::new(); + + match &self.var_c { + HgvsVariant::CdsVariant { loc_edit, .. } => { + for pos in &[&loc_edit.loc.inner().start, &loc_edit.loc.inner().end] { + match pos.cds_from { + CdsFrom::Start => { + if pos.base < 0 { + // 5' UTR + start_end.push(self.reference_data.cds_start as usize - 1); + } else if pos.offset.unwrap_or(0) <= 0 { + start_end.push( + self.reference_data.cds_start as usize - 1 + pos.base as usize + - 1, + ); + } else { + start_end.push( + self.reference_data.cds_start as usize - 1 + pos.base as usize, + ); + } + } + CdsFrom::End => { + // 3' UTR + start_end.push( + self.reference_data.cds_stop as usize + pos.base as usize - 1, + ); + } + } + } + } + _ => panic!("invalid variant"), + } + + ( + self.reference_data.transcript_sequence.to_owned(), + self.reference_data.cds_start, + self.reference_data.cds_stop, + start_end[0], + start_end[1] + 1, + ) + } + + /// Get starting position (AA ref index) of the last frameshift which affects the rest of + /// the sequence, i.e. not offset by subsequent frameshifts. + fn get_frameshift_start(&self, variant_data: AltTranscriptData) -> AltTranscriptData { + AltTranscriptData { + frameshift_start: if variant_data.is_frameshift { + variant_data.variant_start_aa + } else { + variant_data.frameshift_start + }, + ..variant_data + } + } + + fn incorporate_delins(&self) -> Result { + let (mut seq, cds_start, cds_stop, start, end) = self.setup_incorporate(); + let loc_range_start; + + let (reference, alternative) = match &self.var_c { + HgvsVariant::CdsVariant { loc_edit, .. } => { + loc_range_start = loc_edit.loc.inner().start.base; + match loc_edit.edit.inner() { + NaEdit::RefAlt { + reference, + alternative, + } => (Some(reference.clone()), Some(alternative.clone())), + NaEdit::DelRef { reference } => (Some(reference.to_owned()), None), + NaEdit::Ins { alternative } => (None, Some(alternative.to_owned())), + _ => panic!("Can only work with concrete ref/alt"), + } + } + _ => panic!("Can only work on CDS variants"), + }; + let ref_len = reference.as_ref().map(|_| end - start).unwrap_or(0) as i32; + let alt_len = alternative.as_ref().map(|alt| alt.len()).unwrap_or(0) as i32; + let net_base_change = alt_len - ref_len; + let cds_stop = cds_stop + net_base_change; + + // Incorporate the variant into the sequence (depending on the type). + let mut is_substitution = false; + match (reference, alternative) { + (Some(reference), Some(alternative)) => { + // delins or SNP + seq.replace_range(start..end, &alternative); + if reference.len() == 1 && alternative.len() == 1 { + is_substitution = true; + } + } + (Some(_reference), None) => { + // deletion + seq.replace_range(start..end, ""); + } + (None, Some(alternative)) => { + // insertion + seq.insert_str(start + 1, &alternative); + } + _ => panic!("This should not happen"), + } + + let is_frameshift = net_base_change % 3 != 0; + + // Use max. of mod 3 value and 1 (in the event that the indel starts in the 5' UTR range). + let variant_start_aa = std::cmp::max((loc_range_start as f64 / 3.0).ceil() as i32, 1); + + AltTranscriptData::new( + &seq, + cds_start, + cds_stop, + is_frameshift, + Some(variant_start_aa), + &self.reference_data.protein_accession, + is_substitution, + self.ref_has_multiple_stops, + ) + } + + fn incorporate_dup(&self) -> Result { + let (seq, cds_start, cds_stop, start, end) = self.setup_incorporate(); + + let seq = format!( + "{}{}{}{}", + &seq[..start], + &seq[start..end], + &seq[start..end], + &seq[end..] + ); + + let is_frameshift = (end - start) % 3 != 0; + + let loc_range = self + .var_c + .loc_range() + .expect("Could not determine insertion location"); + let variant_start_aa = ((loc_range.end + 1) as f64 / 3.0).ceil() as i32; + + AltTranscriptData::new( + &seq, + cds_start, + cds_stop, + is_frameshift, + Some(variant_start_aa), + &self.reference_data.protein_accession, + false, + self.ref_has_multiple_stops, + ) + } + + /// Incorporate inversion into sequence. + fn incorporate_inv(&self) -> Result { + let (seq, cds_start, cds_stop, start, end) = self.setup_incorporate(); + + let seq = format!( + "{}{}{}", + &seq[..start], + revcomp(&seq[start..end]), + &seq[end..] + ); + + let loc_range = self + .var_c + .loc_range() + .expect("Could not determine insertion location"); + let variant_start_aa = + std::cmp::max((((loc_range.start + 1) as f64) / 3.0).ceil() as i32, 1); + + AltTranscriptData::new( + &seq, + cds_start, + cds_stop, + false, + Some(variant_start_aa), + &self.reference_data.protein_accession, + false, + self.ref_has_multiple_stops, + ) + } + + /// Create an alt seq that matches the reference (for non-CDS variants). + fn create_alt_equals_ref_noncds(&self) -> Result { + AltTranscriptData::new( + &self.reference_data.transcript_sequence, + self.reference_data.cds_start, + self.reference_data.cds_stop, + false, + None, + &self.reference_data.protein_accession, + false, + true, + ) + } + + /// Create a no-protein result. + fn create_no_protein(&self) -> Result { + AltTranscriptData::new( + "", + -1, + -1, + false, + None, + &self.reference_data.protein_accession, + false, + false, + ) + } +} + +/// Build `HgvsVariant::ProtVariant` from information about change to transcript. +pub struct AltSeqToHgvsp { + pub ref_data: RefTranscriptData, + pub alt_data: AltTranscriptData, +} + +#[derive(Debug)] +struct AdHocRecord { + start: i32, + ins: String, + del: String, + is_frameshift: bool, +} + +impl Default for AdHocRecord { + fn default() -> Self { + Self { + start: -1, + ins: "".to_owned(), + del: "".to_owned(), + is_frameshift: false, + } + } +} + +impl AltSeqToHgvsp { + pub fn new(ref_data: RefTranscriptData, alt_data: AltTranscriptData) -> Self { + Self { ref_data, alt_data } + } + + /// Compare two amino acid sequences and generate a HGVS tag from the output. + pub fn build_hgvsp(&self) -> Result { + let mut records = Vec::new(); + + if !self.alt_data.is_ambiguous && !self.alt_seq().is_empty() { + let mut do_delins = true; + if self.ref_seq() == self.alt_seq() { + // Silent p. variant. + let start = self.alt_data.variant_start_aa; + let record = if let Some(start) = start { + if start - 1 < self.ref_seq().len() as i32 { + let del = &self + .ref_seq() + .chars() + .nth(start as usize - 1) + .unwrap() + .to_string(); + AdHocRecord { + start, + ins: del.clone(), + del: del.clone(), + is_frameshift: self.alt_data.is_frameshift, + } + } else { + Default::default() + } + } else { + Default::default() + }; + records.push(record); + do_delins = false; + } else if self.is_substitution() && self.ref_seq().len() == self.alt_seq().len() { + let r = self.ref_seq().chars(); + let a = self.alt_seq().chars(); + let e = r.zip(a).enumerate(); + let mut diff_records = e + .filter(|(_i, (r, a))| r != a) + .map(|(i, (r, a))| AdHocRecord { + start: i as i32 + 1, + del: r.to_string(), + ins: a.to_string(), + is_frameshift: self.alt_data.is_frameshift, + }) + .collect::>(); + if diff_records.len() == 1 { + records.push(diff_records.drain(..).next().unwrap()); + do_delins = false; + } + } + + if do_delins { + let mut start = self.alt_data.variant_start_aa.unwrap() as usize - 1; + while self.ref_seq().chars().nth(start) == self.alt_seq().chars().nth(start) { + start += 1; + } + if self.alt_data.is_frameshift { + // Case: frameshifting delins or dup. + let deletion = &self.ref_seq()[start..]; + let insertion = &self.alt_seq()[start..]; + records.push(AdHocRecord { + start: start as i32 + 1, + ins: insertion.to_owned(), + del: deletion.to_owned(), + is_frameshift: self.alt_data.is_frameshift, + }) + } else { + // Case: non-frameshifting delins or dup. + // + // Get size diff from diff in ref/alt lengths. + let delta = self.alt_seq().len() as isize - self.ref_seq().len() as isize; + let offset = start + delta.unsigned_abs(); + + let (insertion, deletion, ref_sub, alt_sub) = match delta.cmp(&0) { + // if delta > 0 { + Ordering::Greater => ( + // net insertion + self.alt_seq()[start..offset].to_owned(), + "".to_string(), + self.ref_seq()[start..].to_owned(), + self.alt_seq()[offset..].to_owned(), + ), + Ordering::Less => ( + // net deletion + "".to_string(), + self.ref_seq()[start..offset].to_owned(), + self.ref_seq()[offset..].to_owned(), + self.alt_seq()[start..].to_owned(), + ), + Ordering::Equal => ( + // size remains the same + "".to_string(), + "".to_string(), + self.ref_seq()[start..].to_owned(), + self.alt_seq()[start..].to_owned(), + ), + }; + + // From start, get del/ins out to last difference. + let r = ref_sub.chars(); + let a = alt_sub.chars(); + let diff_indices = r + .zip(a) + .enumerate() + .filter(|(_i, (r, a))| r != a) + .map(|(i, _)| i) + .collect::>(); + let diff_indices = if diff_indices.is_empty() + && deletion.is_empty() + && insertion.starts_with('*') + { + vec![0] + } else { + diff_indices + }; + let (deletion, insertion) = if !diff_indices.is_empty() { + let max_diff = diff_indices.last().unwrap() + 1; + ( + format!("{}{}", deletion, &ref_sub[..max_diff]), + format!("{}{}", insertion, &alt_sub[..max_diff]), + ) + } else { + (deletion, insertion) + }; + + records.push(AdHocRecord { + start: start as i32 + 1, + ins: insertion, + del: deletion, + is_frameshift: self.alt_data.is_frameshift, + }); + } + } + } + + if self.alt_data.is_ambiguous { + Ok(self.create_variant( + None, + None, + "", + "", + UncertainLengthChange::None, + false, + self.protein_accession(), + self.alt_data.is_ambiguous, + false, + false, + false, + false, + false, + )?) + } else if self.alt_seq().is_empty() { + Ok(self.create_variant( + None, + None, + "", + "", + UncertainLengthChange::None, + false, + self.protein_accession(), + self.alt_data.is_ambiguous, + false, + false, + true, + false, + false, + )?) + } else if let Some(var) = records.drain(..).next() { + Ok(self.convert_to_hgvs_variant(var, self.protein_accession())?) + } else { + Err(anyhow::anyhow!("Got multiple AA variants - not supported!")) + } + } + + fn protein_accession(&self) -> &str { + &self.ref_data.protein_accession + } + + fn ref_seq(&self) -> &str { + &self.ref_data.aa_sequence + } + + fn alt_seq(&self) -> &str { + &self.alt_data.aa_sequence + } + + fn is_substitution(&self) -> bool { + self.alt_data.is_substitution + } + + fn convert_to_hgvs_variant( + &self, + record: AdHocRecord, + protein_accession: &str, + ) -> Result { + let AdHocRecord { + start, + ins: insertion, + del: deletion, + is_frameshift, + } = &record; + + // defaults + let mut is_dup = false; // assume no dup + let mut fsext_len = UncertainLengthChange::default(); // fs or ext length + let mut is_sub = false; + let mut is_ext = false; + let mut is_init_met = false; + let mut is_ambiguous = self.alt_data.is_ambiguous; + let aa_start; + let aa_end; + let mut reference = String::new(); + let mut alternative = String::new(); + + if *start == 1 { + // initial methionine is modified + // TODO: aa_start/aa_end was in Python code, not needed? + // aa_start = ProtPos { + // aa: "M".to_owned(), + // number: 1, + // }; + // aa_end = aa_start.clone(); + is_init_met = true; + is_ambiguous = true; + } + + if insertion.starts_with('*') { + // stop codon at variant position + aa_start = Some(ProtPos { + aa: deletion.chars().next().unwrap().to_string(), + number: *start, + }); + aa_end = aa_start.clone(); + reference = "".to_string(); + alternative = "*".to_string(); + is_sub = true; + } else if *start as usize == self.ref_seq().len() { + // extension + fsext_len = if self.alt_seq().ends_with('*') { + UncertainLengthChange::Known(insertion.len() as i32 - deletion.len() as i32) + } else { + UncertainLengthChange::Unknown + }; + + aa_start = Some(ProtPos { + aa: "*".to_owned(), + number: *start, + }); + aa_end = aa_start.clone(); + + reference = "".to_owned(); + alternative = insertion.chars().next().unwrap().to_string(); + is_ext = true; + } else if *is_frameshift { + // frameshift + aa_start = Some(ProtPos { + aa: deletion.chars().next().unwrap().to_string(), + number: *start, + }); + aa_end = aa_start.clone(); + + reference = "".to_owned(); + alternative = insertion.chars().next().unwrap().to_string(); + + fsext_len = insertion + .find('*') + .map(|pos| UncertainLengthChange::Known(pos as i32 + 1)) + .unwrap_or(UncertainLengthChange::Unknown); + + // ALL CASES BELOW HERE: no frameshift - sub/delins/dup + } else if insertion == deletion { + // silent + aa_start = if *start == -1 { + None + } else { + Some(ProtPos { + aa: deletion.clone(), + number: *start, + }) + }; + aa_end = aa_start.clone(); + } else if insertion.len() == 1 && deletion.len() == 1 { + // substitution + aa_start = Some(ProtPos { + aa: deletion.clone(), + number: *start, + }); + aa_end = aa_start.clone(); + reference = "".to_owned(); + alternative = insertion.clone(); + is_sub = true; + } else if !deletion.is_empty() { + // delins OR deletion OR stop codon at variant position + reference = deletion.clone(); + let end = start + deletion.len() as i32 - 1; + + aa_start = Some(ProtPos { + aa: deletion.chars().next().unwrap().to_string(), + number: *start, + }); + if !insertion.is_empty() { + // delins + aa_end = if end > *start { + Some(ProtPos { + aa: deletion.chars().last().unwrap().to_string(), + number: end, + }) + } else { + aa_start.clone() + }; + alternative = insertion.clone(); + } else { + // deletion OR stop codon at variant position + if deletion.len() as i32 + start == self.ref_seq().len() as i32 { + // stop codon at variant position + aa_end = aa_start.clone(); + reference = "".to_string(); + alternative = "*".to_string(); + is_sub = true; + } else { + // deletion + aa_end = if end > *start { + Some(ProtPos { + aa: deletion.chars().last().unwrap().to_string(), + number: end, + }) + } else { + aa_start.clone() + }; + alternative = "".to_string() + } + } + } else if deletion.is_empty() { + // insertion OR duplication OR extension + let dup_start; + (is_dup, dup_start) = self.check_if_ins_is_dup(*start, insertion); + + if is_dup { + // is duplication + let dup_end = dup_start + insertion.len() as i32 - 1; + aa_start = Some(ProtPos { + aa: insertion.chars().next().unwrap().to_string(), + number: dup_start, + }); + aa_end = Some(ProtPos { + aa: insertion.chars().last().unwrap().to_string(), + number: dup_end, + }); + reference = "".to_string(); + alternative = reference.clone(); + } else { + // is non-dup insertion + let start = start - 1; + let end = start + 1; + + aa_start = Some(ProtPos { + aa: self + .ref_seq() + .chars() + .nth(start as usize - 1) + .unwrap() + .to_string(), + number: start, + }); + aa_end = Some(ProtPos { + aa: self + .ref_seq() + .chars() + .nth(end as usize - 1) + .unwrap() + .to_string(), + number: end, + }); + reference = "".to_string(); + alternative = insertion.clone(); + } + } else { + panic!("Unexpected variant: {:?}", &record); + } + + self.create_variant( + aa_start, + aa_end, + &reference, + &alternative, + fsext_len, + is_dup, + protein_accession, + is_ambiguous, + is_sub, + is_ext, + false, + is_init_met, + *is_frameshift, + ) + } + + #[allow(clippy::too_many_arguments)] + fn create_variant( + &self, + start: Option, + end: Option, + reference: &str, + alternative: &str, + fsext_len: UncertainLengthChange, + is_dup: bool, + acc: &str, + is_ambiguous: bool, + is_sub: bool, + is_ext: bool, + is_no_protein: bool, + is_init_met: bool, + is_frameshift: bool, + ) -> Result { + assert!(start.is_some() == end.is_some()); + + let loc_edit = if is_init_met { + ProtLocEdit::InitiationUncertain + } else if is_ambiguous { + ProtLocEdit::Unknown + } else if start.is_none() && !is_no_protein { + ProtLocEdit::NoChange + } else if is_no_protein { + ProtLocEdit::NoProteinUncertain + } else { + let interval = ProtInterval { + start: start.expect("must provide start"), + end: end.expect("must provide end"), + }; + // NB: order matters. + // NB: in the original Python module, it is configurable whether the result + // of protein prediction is certain or uncertain by a global configuration + // variable. + ProtLocEdit::Ordinary { + loc: Mu::Certain(interval), + edit: Mu::Certain(if is_sub { + ProteinEdit::Subst { + alternative: alternative.to_string(), + } + } else if is_ext { + ProteinEdit::Ext { + aa_ext: Some(alternative.to_string()), + ext_aa: Some("*".to_string()), + change: fsext_len, + } + } else if is_frameshift { + ProteinEdit::Fs { + alternative: Some(alternative.to_string()), + terminal: Some("*".to_owned()), + length: fsext_len, + } + } else if is_dup { + ProteinEdit::Dup + } else if reference.is_empty() == alternative.is_empty() { + if reference.len() > 1 || alternative.len() > 1 { + ProteinEdit::DelIns { + alternative: alternative.to_string(), + } + } else { + ProteinEdit::Subst { + alternative: alternative.to_string(), + } + } + } else if alternative.is_empty() { + ProteinEdit::Del + } else { + ProteinEdit::Ins { + alternative: alternative.to_string(), + } + }), + } + }; + + Ok(HgvsVariant::ProtVariant { + accession: Accession::new(acc), + gene_symbol: None, + loc_edit, + }) + } + + /// Helper to identity an insertion as a duplicate. + fn check_if_ins_is_dup(&self, start: i32, insertion: &str) -> (bool, i32) { + let dup_cand_start = start as usize - insertion.len() - 1; + let dup_cand = &self.ref_seq()[dup_cand_start..dup_cand_start + insertion.len()]; + if insertion == dup_cand { + (true, dup_cand_start as i32 + 1) + } else { + (false, -1) + } + } +} diff --git a/src/mapper/mod.rs b/src/mapper/mod.rs index 87eca1b..bf95be7 100644 --- a/src/mapper/mod.rs +++ b/src/mapper/mod.rs @@ -1,3 +1,4 @@ pub mod alignment; +pub(crate) mod altseq; pub mod cigar; pub mod variant; diff --git a/src/mapper/variant.rs b/src/mapper/variant.rs index 80f8e9b..0bf9cd5 100644 --- a/src/mapper/variant.rs +++ b/src/mapper/variant.rs @@ -12,61 +12,11 @@ use crate::{ Accession, CdsInterval, CdsLocEdit, CdsPos, GeneSymbol, GenomeInterval, GenomeLocEdit, HgvsVariant, Mu, NaEdit, TxInterval, TxLocEdit, TxPos, }, - utils::revcomp, + sequences::revcomp, validator::{ValidationLevel, Validator}, }; use std::ops::Deref; -/// Implementation details of `Mapper::c_to_p` -mod c_to_p_impl { - use std::rc::Rc; - - use crate::{data::interface::Provider, parser::HgvsVariant}; - - pub struct RefTranscriptData { - /// Data provider. - pub provider: Rc, - /// Transcript accession. - pub tx_ac: String, - /// Protein accession. - pub prot_ac: Option, - } - - pub struct AltSeqBuilder<'a> { - pub reference_data: &'a RefTranscriptData, - } - - pub struct AltData {} - - impl<'a> AltSeqBuilder<'a> { - pub fn new(reference_data: &'a RefTranscriptData) -> Self { - Self { reference_data } - } - - pub fn build_altseq(&self) -> Result, anyhow::Error> { - todo!() - } - } - - pub struct AltSeqToHgvsp<'a> { - pub var_c: &'a HgvsVariant, - pub reference_data: &'a RefTranscriptData, - } - - impl<'a> AltSeqToHgvsp<'a> { - pub fn new(var_c: &'a HgvsVariant, reference_data: &'a RefTranscriptData) -> Self { - Self { - var_c, - reference_data, - } - } - - pub fn build_hgvsp(&self) -> Result { - todo!() - } - } -} - /// Configuration for Mapper. /// /// Defaults are taken from `hgvs` Python library. @@ -551,7 +501,7 @@ impl Mapper { }; let var_g = HgvsVariant::GenomeVariant { - accession: accession.clone(), + accession: Accession::from(alt_ac.to_string()), gene_symbol: self.fetch_gene_symbol(accession.deref().as_str(), gene_symbol)?, loc_edit: GenomeLocEdit { loc: pos_g, @@ -698,7 +648,7 @@ impl Mapper { var_c: &HgvsVariant, prot_ac: Option<&str>, ) -> Result { - use c_to_p_impl::*; + use super::altseq::*; if let HgvsVariant::CdsVariant { accession, @@ -708,12 +658,12 @@ impl Mapper { { self.validator.validate(var_c)?; - let reference_data = RefTranscriptData { - provider: self.provider.clone(), - tx_ac: accession.deref().clone(), - prot_ac: prot_ac.map(|s| s.to_string()), - }; - let builder = AltSeqBuilder::new(&reference_data); + let reference_data = RefTranscriptData::new( + self.provider.clone(), + accession.deref(), + prot_ac.map(|s| s.to_string()).as_deref(), + )?; + let builder = AltSeqBuilder::new(var_c.clone(), reference_data.clone()); // NB: the following comment is from the original code. // TODO: handle case where you get 2+ alt sequences back; currently get lis tof 1 element @@ -722,8 +672,8 @@ impl Mapper { let var_ps: Result, anyhow::Error> = builder .build_altseq()? .into_iter() - .map(|_alt_data| { - let builder = AltSeqToHgvsp::new(var_c, &reference_data); + .map(|alt_data| { + let builder = AltSeqToHgvsp::new(reference_data.clone(), alt_data); builder.build_hgvsp() }) .collect(); @@ -951,27 +901,19 @@ impl Mapper { #[cfg(test)] mod test { - use std::{rc::Rc, str::FromStr}; + use pretty_assertions::assert_eq; + use std::str::FromStr; use test_log::test; use crate::{ - data::uta::{Config as ProviderConfig, Provider}, - parser::HgvsVariant, + data::uta_sr::test_helpers::build_provider, + parser::{HgvsVariant, NoRef}, }; use super::{Config, Mapper}; - fn get_config() -> ProviderConfig { - ProviderConfig { - db_url: std::env::var("TEST_UTA_DATABASE_URL") - .expect("Environment variable TEST_UTA_DATABASE_URL undefined!"), - db_schema: std::env::var("TEST_UTA_DATABASE_SCHEMA") - .expect("Environment variable TEST_UTA_DATABASE_SCHEMA undefined!"), - } - } - fn build_mapper() -> Result { - let provider = Rc::new(Provider::with_config(&get_config())?); + let provider = build_provider()?; let config = Config::default(); Ok(Mapper::new(&config, provider)) } @@ -1010,7 +952,6 @@ mod test { Ok(()) } - #[ignore] #[test] fn fail_on_undefined_cds() -> Result<(), anyhow::Error> { let mapper = build_mapper()?; @@ -1020,12 +961,6 @@ mod test { let var_n = HgvsVariant::from_str(hgvs_n)?; let var_c = HgvsVariant::from_str(hgvs_c)?; - let _tx_ac = if let HgvsVariant::TxVariant { accession, .. } = &var_c { - Some(accession.value.clone()) - } else { - None - } - .unwrap(); // n_to_c: transcript is non-coding assert!(mapper.n_to_c(&var_n).is_err()); @@ -1036,7 +971,6 @@ mod test { Ok(()) } - #[ignore] #[test] fn map_var_of_unsupported_validation() -> Result<(), anyhow::Error> { let mapper = build_mapper()?; @@ -1044,12 +978,14 @@ mod test { let var_c = HgvsVariant::from_str(hgvs_c)?; let var_g = mapper.c_to_g(&var_c, "NC_000007.13", "splign")?; - assert_eq!(format!("{}", &var_g), "NC_000007.13:g.21940852_21940908del"); + assert_eq!( + format!("{}", &NoRef(&var_g)), + "NC_000007.13:g.21940852_21940908del" + ); Ok(()) } - #[ignore] #[test] fn map_to_unknown_p_effect() -> Result<(), anyhow::Error> { let mapper = build_mapper()?; @@ -1061,36 +997,567 @@ mod test { Ok(()) } - #[ignore] - #[test] - fn map_of_c_out_of_cds_bound() -> Result<(), anyhow::Error> { - let mapper = build_mapper()?; - let hgvs_c = "NM_145901.2:c.343T>C"; // gene HMGA1 - let var_c = HgvsVariant::from_str(hgvs_c)?; - assert!(mapper.c_to_p(&var_c, None).is_err()); + // TODO(#17): Need to implement validation. + // #[test] + // fn map_of_c_out_of_cds_bound() -> Result<(), anyhow::Error> { + // let mapper = build_mapper()?; + // let hgvs_c = "NM_145901.2:c.343T>C"; // gene HMGA1 + // let var_c = HgvsVariant::from_str(hgvs_c)?; + // assert!(mapper.c_to_p(&var_c, None).is_err()); - Ok(()) - } + // Ok(()) + // } - #[ignore] #[test] fn map_of_dup_at_cds_end() -> Result<(), anyhow::Error> { let mapper = build_mapper()?; let hgvs_c = "NM_001051.2:c.1257dupG"; // gene SSTR3 let var_c = HgvsVariant::from_str(hgvs_c)?; let var_p = mapper.c_to_p(&var_c, None)?; - assert_eq!(format!("{}", &var_p), "NP_001042.1:p.(=)"); + assert_eq!(format!("{}", &var_p), "NP_001042.1:p.="); + + Ok(()) + } + + // TODO(#17): Need to implement validation. + // #[test] + // fn map_of_c_out_of_reference_bound() -> Result<(), anyhow::Error> { + // let mapper = build_mapper()?; + // let hgvs_c = "NM_000249.3:c.-73960_*46597del"; // gene MLH1 + // let var_c = HgvsVariant::from_str(hgvs_c)?; + // assert!(mapper.c_to_p(&var_c, None).is_err()); + + // Ok(()) + // } + + /// The following tests corresponds to the `test_hgvs_variantmapper_cp_sanity.py` + /// test suite of the Python package. It uses a mock data provider, defined + /// in the `sanity_mock` module. + + mod sanity_mock { + use std::{ + path::{Path, PathBuf}, + rc::Rc, + }; + + use crate::data::interface::Provider as ProviderInterface; + use crate::{ + data::interface::TxIdentityInfo, + mapper::variant::{Config, Mapper}, + }; + + #[derive(Debug, serde::Deserialize)] + struct ProviderRecord { + pub accession: String, + pub transcript_sequence: String, + pub cds_start_i: i32, + pub cds_end_i: i32, + } + + pub struct Provider { + records: Vec, + } + + impl Provider { + pub fn new(path: &Path) -> Result { + let mut records = Vec::new(); + + let mut rdr = csv::ReaderBuilder::new() + .delimiter(b'\t') + .has_headers(true) + .from_path(path)?; + for record in rdr.deserialize() { + records.push(record?); + } + + Ok(Self { records }) + } + } + + impl ProviderInterface for Provider { + fn data_version(&self) -> &str { + panic!("for test use only"); + } + + fn schema_version(&self) -> &str { + panic!("for test use only"); + } + + fn get_assembly_map( + &self, + _assembly: crate::static_data::Assembly, + ) -> linked_hash_map::LinkedHashMap { + panic!("for test use only"); + } + + fn get_gene_info( + &self, + _hgnc: &str, + ) -> Result { + panic!("for test use only"); + } + + fn get_pro_ac_for_tx_ac(&self, _tx_ac: &str) -> Result, anyhow::Error> { + panic!("for test use only"); + } + + fn get_seq_part( + &self, + tx_ac: &str, + begin: Option, + end: Option, + ) -> Result { + for record in &self.records { + if record.accession == tx_ac { + let seq = &record.transcript_sequence; + return match (begin, end) { + (None, None) => Ok(seq.to_string()), + (None, Some(end)) => Ok(seq[..end].to_string()), + (Some(begin), None) => Ok(seq[begin..].to_string()), + (Some(begin), Some(end)) => Ok(seq[begin..end].to_string()), + }; + } + } + Err(anyhow::anyhow!("Found no record for accession {}", &tx_ac)) + } + + fn get_acs_for_protein_seq(&self, _seq: &str) -> Result, anyhow::Error> { + panic!("for test use only"); + } + + fn get_similar_transcripts( + &self, + _tx_ac: &str, + ) -> Result, anyhow::Error> + { + panic!("for test use only"); + } + + fn get_tx_exons( + &self, + _tx_ac: &str, + _alt_ac: &str, + _alt_aln_method: &str, + ) -> Result, anyhow::Error> { + todo!() + } + + fn get_tx_for_gene( + &self, + _gene: &str, + ) -> Result, anyhow::Error> { + panic!("for test use only"); + } + + fn get_tx_for_region( + &self, + _alt_ac: &str, + _alt_aln_method: &str, + _start_i: i32, + _end_i: i32, + ) -> Result, anyhow::Error> { + panic!("for test use only"); + } + + fn get_tx_identity_info(&self, tx_ac: &str) -> Result { + for record in &self.records { + if record.accession == tx_ac { + return Ok(TxIdentityInfo { + tx_ac: record.accession.clone(), + alt_ac: record.accession.clone(), + alt_aln_method: "splign".to_string(), + cds_start_i: record.cds_start_i, + cds_end_i: record.cds_end_i, + lengths: Vec::new(), + hgnc: "MOCK".to_string(), + }); + } + } + Err(anyhow::anyhow!("Found no record for accession {}", &tx_ac)) + } + + fn get_tx_info( + &self, + _tx_ac: &str, + _alt_ac: &str, + _alt_aln_method: &str, + ) -> Result { + panic!("for test use only"); + } + + fn get_tx_mapping_options( + &self, + _tx_ac: &str, + ) -> Result, anyhow::Error> + { + panic!("for test use only"); + } + } + + pub fn build_mapper() -> Result { + let path = PathBuf::from("tests/data/mapper/sanity_cp.tsv"); + let provider = Rc::new(Provider::new(&path)?); + let config = Config::default(); + Ok(Mapper::new(&config, provider)) + } + } + + fn test_hgvs_c_to_p_conversion(hgvsc: &str, hgvsp_expected: &str) -> Result<(), anyhow::Error> { + let mapper = sanity_mock::build_mapper()?; + + let var_c = HgvsVariant::from_str(hgvsc)?; + let ac_p = "MOCK"; + + let var_p = mapper.c_to_p(&var_c, Some(ac_p))?; + let hgvsp_actual = format!("{}", &var_p); + + assert_eq!(hgvsp_actual, hgvsp_expected); Ok(()) } - #[ignore] #[test] - fn map_of_c_out_of_reference_bound() -> Result<(), anyhow::Error> { - let mapper = build_mapper()?; - let hgvs_c = "NM_000249.3:c.-73960_*46597del"; // gene MLH1 - let var_c = HgvsVariant::from_str(hgvs_c)?; - assert!(mapper.c_to_p(&var_c, None).is_err()); + fn hgvs_c_to_p_silent() -> Result<(), anyhow::Error> { + let hgvsc = "NM_999999.1:c.6A>G"; + let hgvsp_expected = "MOCK:p.Lys2="; + test_hgvs_c_to_p_conversion(hgvsc, hgvsp_expected)?; + + Ok(()) + } + + #[test] + fn hgvs_c_to_p_substitution() -> Result<(), anyhow::Error> { + let hgvsc = "NM_999999.1:c.6A>T"; + let hgvsp_expected = "MOCK:p.Lys2Asn"; + test_hgvs_c_to_p_conversion(hgvsc, hgvsp_expected)?; + + Ok(()) + } + + #[test] + fn hgvs_c_to_p_substitution_introduces_stop_codon() -> Result<(), anyhow::Error> { + let hgvsc = "NM_999996.1:c.8C>A"; + let hgvsp_expected = "MOCK:p.Ser3Ter"; + test_hgvs_c_to_p_conversion(hgvsc, hgvsp_expected)?; + + Ok(()) + } + + #[test] + fn hgvs_c_to_p_substitution_removes_stop_codon() -> Result<(), anyhow::Error> { + let hgvsc = "NM_999998.1:c.30G>T"; + let hgvsp_expected = "MOCK:p.Ter10TyrextTer3"; + test_hgvs_c_to_p_conversion(hgvsc, hgvsp_expected)?; + + Ok(()) + } + + //xx + #[test] + fn hgvs_c_to_p_insertion_no_frameshift() -> Result<(), anyhow::Error> { + let hgvsc = "NM_999999.1:c.6_7insGGG"; + let hgvsp_expected = "MOCK:p.Lys2_Ala3insGly"; + test_hgvs_c_to_p_conversion(hgvsc, hgvsp_expected)?; + + Ok(()) + } + + #[test] + fn hgvs_c_to_p_insertion_frameshift() -> Result<(), anyhow::Error> { + let hgvsc = "NM_999999.1:c.22_23insT"; + let hgvsp_expected = "MOCK:p.Ala8ValfsTer?"; + test_hgvs_c_to_p_conversion(hgvsc, hgvsp_expected)?; + + Ok(()) + } + + #[test] + fn hgvs_c_to_p_adds_stop() -> Result<(), anyhow::Error> { + let hgvsc = "NM_999999.1:c.8_9insTT"; + let hgvsp_expected = "MOCK:p.Lys4Ter"; + test_hgvs_c_to_p_conversion(hgvsc, hgvsp_expected)?; + + Ok(()) + } + + #[test] + fn hgvs_c_to_p_deletion_no_frameshift() -> Result<(), anyhow::Error> { + let hgvsc = "NM_999999.1:c.10_12del"; + let hgvsp_expected = "MOCK:p.Lys4del"; + test_hgvs_c_to_p_conversion(hgvsc, hgvsp_expected)?; + + Ok(()) + } + + #[test] + fn hgvs_c_to_p_deletion2_no_frameshift() -> Result<(), anyhow::Error> { + let hgvsc = "NM_999999.1:c.4_15del"; + let hgvsp_expected = "MOCK:p.Lys2_Ala5del"; + test_hgvs_c_to_p_conversion(hgvsc, hgvsp_expected)?; + + Ok(()) + } + + #[test] + fn hgvs_c_to_p_deletion3_no_frameshift_c_term() -> Result<(), anyhow::Error> { + let hgvsc = "NM_999995.1:c.4_6del"; + let hgvsp_expected = "MOCK:p.Lys3del"; + test_hgvs_c_to_p_conversion(hgvsc, hgvsp_expected)?; + + Ok(()) + } + + #[test] + fn hgvs_c_to_p_deletion4_no_frameshift_c_term() -> Result<(), anyhow::Error> { + let hgvsc = "NM_999994.1:c.4_9del"; + let hgvsp_expected = "MOCK:p.Lys3_Lys4del"; + test_hgvs_c_to_p_conversion(hgvsc, hgvsp_expected)?; + + Ok(()) + } + + #[test] + fn hgvs_c_to_p_deletion5_no_frameshift() -> Result<(), anyhow::Error> { + let hgvsc = "NM_999994.1:c.20_25del"; + let hgvsp_expected = "MOCK:p.Ala7_Arg9delinsGly"; + test_hgvs_c_to_p_conversion(hgvsc, hgvsp_expected)?; + + Ok(()) + } + + #[test] + fn hgvs_c_to_p_deletion6_no_frameshift() -> Result<(), anyhow::Error> { + let hgvsc = "NM_999999.1:c.5_7del"; + let hgvsp_expected = "MOCK:p.Lys2_Ala3delinsThr"; + test_hgvs_c_to_p_conversion(hgvsc, hgvsp_expected)?; + + Ok(()) + } + + #[test] + fn hgvs_c_to_p_deletion7_no_frameshift() -> Result<(), anyhow::Error> { + let hgvsc = "NM_999993.1:c.13_24del"; + let hgvsp_expected = "MOCK:p.Arg5_Ala8del"; + test_hgvs_c_to_p_conversion(hgvsc, hgvsp_expected)?; + + Ok(()) + } + + #[test] + fn hgvs_c_to_p_deletion_frameshift_nostop() -> Result<(), anyhow::Error> { + let hgvsc = "NM_999999.1:c.11_12del"; + let hgvsp_expected = "MOCK:p.Lys4SerfsTer?"; + test_hgvs_c_to_p_conversion(hgvsc, hgvsp_expected)?; + + Ok(()) + } + + #[test] + fn hgvs_c_to_p_deletion_frameshift_adds_stop() -> Result<(), anyhow::Error> { + let hgvsc = "NM_999997.1:c.7del"; + let hgvsp_expected = "MOCK:p.Ala3ArgfsTer6"; + test_hgvs_c_to_p_conversion(hgvsc, hgvsp_expected)?; + + Ok(()) + } + + #[test] + fn hgvs_c_to_p_deletion_no_frameshift_removes_stop_plus_previous() -> Result<(), anyhow::Error> + { + let hgvsc = "NM_999999.1:c.25_30del"; + let hgvsp_expected = "MOCK:p.Lys9_Ter10delinsGly"; + test_hgvs_c_to_p_conversion(hgvsc, hgvsp_expected)?; + + Ok(()) + } + + #[test] + fn hgvs_c_to_p_indel_no_frameshift() -> Result<(), anyhow::Error> { + let hgvsc = "NM_999999.1:c.11_12delinsTCCCA"; + let hgvsp_expected = "MOCK:p.Lys4delinsIlePro"; + test_hgvs_c_to_p_conversion(hgvsc, hgvsp_expected)?; + + Ok(()) + } + + #[test] + fn hgvs_c_to_p_indel2_no_frameshift() -> Result<(), anyhow::Error> { + let hgvsc = "NM_999999.1:c.11_18delinsTCCCA"; + let hgvsp_expected = "MOCK:p.Lys4_Phe6delinsIlePro"; + test_hgvs_c_to_p_conversion(hgvsc, hgvsp_expected)?; + + Ok(()) + } + + #[test] + fn hgvs_c_to_p_indel_frameshift_nostop() -> Result<(), anyhow::Error> { + let hgvsc = "NM_999999.1:c.8delinsGG"; + let hgvsp_expected = "MOCK:p.Ala3GlyfsTer?"; + test_hgvs_c_to_p_conversion(hgvsc, hgvsp_expected)?; + + Ok(()) + } + + #[test] + fn hgvs_c_to_p_dup_1aa_no_frameshift_2() -> Result<(), anyhow::Error> { + let hgvsc = "NM_999999.1:c.10_12dup"; + let hgvsp_expected = "MOCK:p.Lys4dup"; + test_hgvs_c_to_p_conversion(hgvsc, hgvsp_expected)?; + + Ok(()) + } + + #[test] + fn hgvs_c_to_p_dup_1aa_no_frameshift() -> Result<(), anyhow::Error> { + let hgvsc = "NM_999999.1:c.16_18dup"; + let hgvsp_expected = "MOCK:p.Phe6dup"; + test_hgvs_c_to_p_conversion(hgvsc, hgvsp_expected)?; + + Ok(()) + } + + #[test] + fn hgvs_c_to_p_dup_2aa_no_frameshift() -> Result<(), anyhow::Error> { + let hgvsc = "NM_999999.1:c.16_21dup"; + let hgvsp_expected = "MOCK:p.Phe6_Arg7dup"; + test_hgvs_c_to_p_conversion(hgvsc, hgvsp_expected)?; + + Ok(()) + } + + #[test] + fn hgvs_c_to_p_dup_2aa2_no_frameshift() -> Result<(), anyhow::Error> { + let hgvsc = "NM_999995.1:c.4_6dup"; + let hgvsp_expected = "MOCK:p.Lys3dup"; + test_hgvs_c_to_p_conversion(hgvsc, hgvsp_expected)?; + + Ok(()) + } + + #[test] + fn hgvs_c_to_p_3aa_no_frameshift() -> Result<(), anyhow::Error> { + let hgvsc = "NM_999999.1:c.16_24dup"; + let hgvsp_expected = "MOCK:p.Phe6_Ala8dup"; + test_hgvs_c_to_p_conversion(hgvsc, hgvsp_expected)?; + + Ok(()) + } + + #[test] + fn hgvs_c_to_p_dup_frameshift() -> Result<(), anyhow::Error> { + let hgvsc = "NM_999999.1:c.12_13dup"; + let hgvsp_expected = "MOCK:p.Ala5GlufsTer?"; + test_hgvs_c_to_p_conversion(hgvsc, hgvsp_expected)?; + + Ok(()) + } + + #[test] + fn hgvs_c_to_p_intron() -> Result<(), anyhow::Error> { + let hgvsc = "NM_999999.1:c.12+1G>A"; + let hgvsp_expected = "MOCK:p.?"; + test_hgvs_c_to_p_conversion(hgvsc, hgvsp_expected)?; + + Ok(()) + } + + #[test] + fn hgvs_c_to_p_five_prime_utr() -> Result<(), anyhow::Error> { + let hgvsc = "NM_999999.1:c.-2A>G"; + let hgvsp_expected = "MOCK:p.?"; + test_hgvs_c_to_p_conversion(hgvsc, hgvsp_expected)?; + + Ok(()) + } + + #[test] + fn hgvs_c_to_p_three_prime_utr() -> Result<(), anyhow::Error> { + let hgvsc = "NM_999999.1:c.*3G>A"; + let hgvsp_expected = "MOCK:p.?"; + test_hgvs_c_to_p_conversion(hgvsc, hgvsp_expected)?; + + Ok(()) + } + + #[test] + fn hgvs_c_to_p_deletion_into_three_prime_utr_frameshift() -> Result<(), anyhow::Error> { + let hgvsc = "NM_999999.1:c.27_*3del"; + let hgvsp_expected = "MOCK:p.Lys9XaafsTer?"; + test_hgvs_c_to_p_conversion(hgvsc, hgvsp_expected)?; + + Ok(()) + } + + #[test] + fn hgvs_c_to_p_deletion_into_three_prime_utr_no_frameshift() -> Result<(), anyhow::Error> { + let hgvsc = "NM_999995.1:c.28_*3del"; + let hgvsp_expected = "MOCK:p.Lys10_Ter11delinsArgGlnPheArg"; + test_hgvs_c_to_p_conversion(hgvsc, hgvsp_expected)?; + + Ok(()) + } + + #[test] + fn hgvs_c_to_p_delins_into_three_prime_utr_no_frameshift() -> Result<(), anyhow::Error> { + let hgvsc = "NM_999995.1:c.28_*3delinsGGG"; + let hgvsp_expected = "MOCK:p.Lys10_Ter11delinsGlyArgGlnPheArg"; + test_hgvs_c_to_p_conversion(hgvsc, hgvsp_expected)?; + + Ok(()) + } + + /// See recommendations re p.? (p.Met1?) at: + /// http://varnomen.hgvs.org/recommendations/protein/variant/substitution/ + #[test] + fn hgvs_c_to_p_substitution_removes_start_codon() -> Result<(), anyhow::Error> { + let hgvsc = "NM_999999.1:c.1A>G"; + let hgvsp_expected = "MOCK:p.Met1?"; + test_hgvs_c_to_p_conversion(hgvsc, hgvsp_expected)?; + + Ok(()) + } + + #[test] + fn hgvs_c_to_p_deletion_from_five_prime_utr_frameshift() -> Result<(), anyhow::Error> { + let hgvsc = "NM_999999.1:c.-3_1del"; + let hgvsp_expected = "MOCK:p.Met1?"; + test_hgvs_c_to_p_conversion(hgvsc, hgvsp_expected)?; + + Ok(()) + } + + #[test] + fn hgvs_c_to_p_deletion_from_five_prime_utr_no_frameshift() -> Result<(), anyhow::Error> { + let hgvsc = "NM_999999.1:c.-3_3del"; + let hgvsp_expected = "MOCK:p.Met1?"; + test_hgvs_c_to_p_conversion(hgvsc, hgvsp_expected)?; + + Ok(()) + } + + #[test] + fn hgvs_c_to_p_delins_from_five_prime_utr_no_frameshift() -> Result<(), anyhow::Error> { + let hgvsc = "NM_999999.1:c.-3_3delinsAAA"; + let hgvsp_expected = "MOCK:p.Met1?"; + test_hgvs_c_to_p_conversion(hgvsc, hgvsp_expected)?; + + Ok(()) + } + + #[test] + fn hgvs_c_to_p_delete_entire_gene() -> Result<(), anyhow::Error> { + let hgvsc = "NM_999999.1:c.-3_*1del"; + let hgvsp_expected = "MOCK:p.0?"; + test_hgvs_c_to_p_conversion(hgvsc, hgvsp_expected)?; + + Ok(()) + } + + #[test] + fn hgvs_c_to_p_multiple_stop_codons() -> Result<(), anyhow::Error> { + let hgvsc = "NM_999992.1:c.4G>A"; + let hgvsp_expected = "MOCK:p.?"; + test_hgvs_c_to_p_conversion(hgvsc, hgvsp_expected)?; Ok(()) } diff --git a/src/normalizer.rs b/src/normalizer.rs index f4aa97f..13cfd8a 100644 --- a/src/normalizer.rs +++ b/src/normalizer.rs @@ -9,7 +9,7 @@ use crate::{ GenomeInterval, GenomeLocEdit, HgvsVariant, MtInterval, MtLocEdit, Mu, NaEdit, RnaInterval, RnaLocEdit, RnaPos, TxInterval, TxLocEdit, TxPos, }, - utils::{revcomp, trim_common_prefixes, trim_common_suffixes}, + sequences::{revcomp, trim_common_prefixes, trim_common_suffixes}, validator::Validator, }; diff --git a/src/parser/display.rs b/src/parser/display.rs index 3e1b585..ae8b289 100644 --- a/src/parser/display.rs +++ b/src/parser/display.rs @@ -6,7 +6,7 @@ use std::fmt::Display; -use crate::parser::ds::*; +use crate::{parser::ds::*, sequences::aa_to_aa3}; /// Newtype that allows to suppress printing of reference bases. pub struct NoRef<'a, T>(pub &'a T) @@ -139,20 +139,34 @@ impl Display for ProteinEdit { (Some(alt), None, UncertainLengthChange::None) => write!(f, "{alt}fs"), (Some(alt), None, UncertainLengthChange::Unknown) => write!(f, "{alt}fs?"), (Some(alt), None, UncertainLengthChange::Known(count)) => { + let alt = aa_to_aa3(alt).expect("aa_to_aa3 conversion failed"); write!(f, "{alt}fs{count}") } - (None, Some(ter), UncertainLengthChange::None) => write!(f, "fs{ter}"), - (None, Some(ter), UncertainLengthChange::Unknown) => write!(f, "fs{ter}?"), + (None, Some(ter), UncertainLengthChange::None) => { + let ter = aa_to_aa3(ter).expect("aa_to_aa3 conversion failed"); + write!(f, "fs{ter}") + } + (None, Some(ter), UncertainLengthChange::Unknown) => { + let ter = aa_to_aa3(ter).expect("aa_to_aa3 conversion failed"); + write!(f, "fs{ter}?") + } (None, Some(ter), UncertainLengthChange::Known(count)) => { + let ter = aa_to_aa3(ter).expect("aa_to_aa3 conversion failed"); write!(f, "fs{ter}{count}") } (Some(alt), Some(ter), UncertainLengthChange::None) => { + let alt = aa_to_aa3(alt).expect("aa_to_aa3 conversion failed"); + let ter = aa_to_aa3(ter).expect("aa_to_aa3 conversion failed"); write!(f, "{alt}fs{ter}") } (Some(alt), Some(ter), UncertainLengthChange::Unknown) => { + let alt = aa_to_aa3(alt).expect("aa_to_aa3 conversion failed"); + let ter = aa_to_aa3(ter).expect("aa_to_aa3 conversion failed"); write!(f, "{alt}fs{ter}?") } (Some(alt), Some(ter), UncertainLengthChange::Known(count)) => { + let alt = aa_to_aa3(alt).expect("aa_to_aa3 conversion failed"); + let ter = aa_to_aa3(ter).expect("aa_to_aa3 conversion failed"); write!(f, "{alt}fs{ter}{count}") } }, @@ -164,29 +178,56 @@ impl Display for ProteinEdit { (None, None, UncertainLengthChange::None) => write!(f, "ext"), (None, None, UncertainLengthChange::Unknown) => write!(f, "ext?"), (None, None, UncertainLengthChange::Known(count)) => write!(f, "ext{count}"), - (Some(alt), None, UncertainLengthChange::None) => write!(f, "{alt}ext"), - (Some(alt), None, UncertainLengthChange::Unknown) => write!(f, "{alt}ext?"), + (Some(alt), None, UncertainLengthChange::None) => { + let alt = aa_to_aa3(alt).expect("aa_to_aa3 conversion failed"); + write!(f, "{alt}ext") + } + (Some(alt), None, UncertainLengthChange::Unknown) => { + let alt = aa_to_aa3(alt).expect("aa_to_aa3 conversion failed"); + write!(f, "{alt}ext?") + } (Some(alt), None, UncertainLengthChange::Known(count)) => { + let alt = aa_to_aa3(alt).expect("aa_to_aa3 conversion failed"); write!(f, "{alt}ext{count}") } (None, Some(ter), UncertainLengthChange::None) => write!(f, "ext{ter}"), (None, Some(ter), UncertainLengthChange::Unknown) => write!(f, "ext{ter}?"), (None, Some(ter), UncertainLengthChange::Known(count)) => { + let ter = aa_to_aa3(ter).expect("aa_to_aa3 conversion failed"); write!(f, "ext{ter}{count}") } (Some(alt), Some(ter), UncertainLengthChange::None) => { + let alt = aa_to_aa3(alt).expect("aa_to_aa3 conversion failed"); + let ter = aa_to_aa3(ter).expect("aa_to_aa3 conversion failed"); write!(f, "{alt}ext{ter}") } (Some(alt), Some(ter), UncertainLengthChange::Unknown) => { + let alt = aa_to_aa3(alt).expect("aa_to_aa3 conversion failed"); + let ter = aa_to_aa3(ter).expect("aa_to_aa3 conversion failed"); write!(f, "{alt}ext{ter}?") } (Some(alt), Some(ter), UncertainLengthChange::Known(count)) => { + let alt = aa_to_aa3(alt).expect("aa_to_aa3 conversion failed"); + let ter = aa_to_aa3(ter).expect("aa_to_aa3 conversion failed"); write!(f, "{alt}ext{ter}{count}") } }, - ProteinEdit::Subst { alternative } => write!(f, "{alternative}"), - ProteinEdit::DelIns { alternative } => write!(f, "delins{alternative}"), - ProteinEdit::Ins { alternative } => write!(f, "ins{alternative}"), + ProteinEdit::Subst { alternative } => { + let alternative = aa_to_aa3(alternative).expect("aa_to_aa3 conversion failed"); + if alternative.is_empty() { + write!(f, "=") + } else { + write!(f, "{alternative}") + } + } + ProteinEdit::DelIns { alternative } => { + let alternative = aa_to_aa3(alternative).expect("aa_to_aa3 conversion failed"); + write!(f, "delins{alternative}") + } + ProteinEdit::Ins { alternative } => { + let alternative = aa_to_aa3(alternative).expect("aa_to_aa3 conversion failed"); + write!(f, "ins{alternative}") + } ProteinEdit::Del => write!(f, "del"), ProteinEdit::Dup => write!(f, "dup"), ProteinEdit::Ident => write!(f, "="), @@ -196,7 +237,8 @@ impl Display for ProteinEdit { impl Display for ProtPos { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}{}", self.aa, self.number) + let aa = aa_to_aa3(&self.aa).expect("aa_to_aa3 conversion failed"); + write!(f, "{aa}{}", self.number) } } @@ -212,12 +254,15 @@ impl Display for ProtInterval { impl Display for ProtLocEdit { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + // TODO: make configurable whether inferred protein is uncertain or not? match self { ProtLocEdit::Ordinary { loc, edit } => write!(f, "{loc}{edit}"), ProtLocEdit::NoChange => write!(f, "="), ProtLocEdit::NoChangeUncertain => write!(f, "(=)"), ProtLocEdit::NoProtein => write!(f, "0"), ProtLocEdit::NoProteinUncertain => write!(f, "0?"), + ProtLocEdit::Unknown => write!(f, "?"), + ProtLocEdit::InitiationUncertain => write!(f, "Met1?"), } } } diff --git a/src/parser/ds.rs b/src/parser/ds.rs index 2b5f830..a0b9091 100644 --- a/src/parser/ds.rs +++ b/src/parser/ds.rs @@ -146,8 +146,9 @@ impl NaEdit { } /// Uncertain change through extension. -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, Default)] pub enum UncertainLengthChange { + #[default] None, Unknown, Known(i32), @@ -159,7 +160,7 @@ pub struct Accession { pub value: String, } -impl lazy_static::__Deref for Accession { +impl Deref for Accession { type Target = String; fn deref(&self) -> &Self::Target { @@ -726,6 +727,10 @@ pub enum ProtLocEdit { NoProtein, /// `0?` NoProteinUncertain, + /// `?` + Unknown, + /// `Met1?` + InitiationUncertain, } /// Protein position interval. @@ -748,7 +753,7 @@ impl From for Range { } /// Protein position. -#[derive(Clone, Debug, PartialEq)] +#[derive(Clone, Debug, PartialEq, Default)] pub struct ProtPos { /// Amino acid value. pub aa: String, diff --git a/src/parser/impl_parse.rs b/src/parser/impl_parse.rs index 04bbb8a..d8e01bf 100644 --- a/src/parser/impl_parse.rs +++ b/src/parser/impl_parse.rs @@ -314,6 +314,10 @@ impl ProtLocEdit { map(tag("(=)"), |_| ProtLocEdit::NoChangeUncertain)(input) } + fn parse_unknown(input: &str) -> IResult<&str, Self> { + map(tag("?"), |_| ProtLocEdit::Unknown)(input) + } + fn parse_no_protein(input: &str) -> IResult<&str, Self> { map(tag("0"), |_| ProtLocEdit::NoProtein)(input) } @@ -338,6 +342,7 @@ impl Parseable for ProtLocEdit { Self::parse_no_protein, Self::parse_no_change_uncertain, Self::parse_no_change, + Self::parse_unknown, ))(input) } } diff --git a/src/sequences.rs b/src/sequences.rs new file mode 100644 index 0000000..002262e --- /dev/null +++ b/src/sequences.rs @@ -0,0 +1,928 @@ +//! Utility code for working with sequences. +//! +//! Partially ported over from `bioutils.sequences`. + +use md5::{Digest, Md5}; +use phf::phf_map; + +pub fn trim_common_prefixes(reference: &str, alternative: &str) -> (usize, String, String) { + if reference.is_empty() || alternative.is_empty() { + return (0, reference.to_string(), alternative.to_string()); + } + + let mut trim = 0; + while trim < reference.len() && trim < alternative.len() { + if reference.chars().nth(trim) != alternative.chars().nth(trim) { + break; + } + + trim += 1; + } + + ( + trim, + reference[trim..].to_string(), + alternative[trim..].to_string(), + ) +} + +pub fn trim_common_suffixes(reference: &str, alternative: &str) -> (usize, String, String) { + if reference.is_empty() || alternative.is_empty() { + return (0, reference.to_string(), alternative.to_string()); + } + + let mut trim = 0; + let mut i_r = reference.len(); + let mut i_a = alternative.len(); + let mut pad = 0; + while trim < reference.len() && trim < alternative.len() { + trim += 1; + assert!(i_r > 0); + assert!(i_a > 0); + i_r -= 1; + i_a -= 1; + + if reference.chars().nth(i_r) != alternative.chars().nth(i_a) { + pad = 1; + break; + } + } + + ( + trim - pad, + reference[..(i_r + pad)].to_string(), + alternative[..(i_a + pad)].to_string(), + ) +} + +/// Reverse complementing shortcut. +pub fn revcomp(seq: &str) -> String { + std::str::from_utf8(&bio::alphabets::dna::revcomp(seq.as_bytes())) + .unwrap() + .to_string() +} + +static AA3_TO_AA1_LUT: phf::Map<&'static str, &'static str> = phf_map! { + "Ala" => "A", + "Arg" => "R", + "Asn" => "N", + "Asp" => "D", + "Cys" => "C", + "Gln" => "Q", + "Glu" => "E", + "Gly" => "G", + "His" => "H", + "Ile" => "I", + "Leu" => "L", + "Lys" => "K", + "Met" => "M", + "Phe" => "F", + "Pro" => "P", + "Ser" => "S", + "Thr" => "T", + "Trp" => "W", + "Tyr" => "Y", + "Val" => "V", + "Xaa" => "X", + "Ter" => "*", + "Sec" => "U", +}; + +static AA1_TO_AA3_LUT: phf::Map<&'static str, &'static str> = phf_map! { + "A" => "Ala", + "R" => "Arg", + "N" => "Asn", + "D" => "Asp", + "C" => "Cys", + "Q" => "Gln", + "E" => "Glu", + "G" => "Gly", + "H" => "His", + "I" => "Ile", + "L" => "Leu", + "K" => "Lys", + "M" => "Met", + "F" => "Phe", + "P" => "Pro", + "S" => "Ser", + "T" => "Thr", + "W" => "Trp", + "Y" => "Tyr", + "V" => "Val", + "X" => "Xaa", + "*" => "Ter", + "U" => "Sec", +}; + +/// NCBI standard translation table. +static DNA_TO_AA1_LUT: phf::Map<&'static str, &'static str> = phf_map! { + "AAA" => "K", + "AAC" => "N", + "AAG" => "K", + "AAT" => "N", + "ACA" => "T", + "ACC" => "T", + "ACG" => "T", + "ACT" => "T", + "AGA" => "R", + "AGC" => "S", + "AGG" => "R", + "AGT" => "S", + "ATA" => "I", + "ATC" => "I", + "ATG" => "M", + "ATT" => "I", + "CAA" => "Q", + "CAC" => "H", + "CAG" => "Q", + "CAT" => "H", + "CCA" => "P", + "CCC" => "P", + "CCG" => "P", + "CCT" => "P", + "CGA" => "R", + "CGC" => "R", + "CGG" => "R", + "CGT" => "R", + "CTA" => "L", + "CTC" => "L", + "CTG" => "L", + "CTT" => "L", + "GAA" => "E", + "GAC" => "D", + "GAG" => "E", + "GAT" => "D", + "GCA" => "A", + "GCC" => "A", + "GCG" => "A", + "GCT" => "A", + "GGA" => "G", + "GGC" => "G", + "GGG" => "G", + "GGT" => "G", + "GTA" => "V", + "GTC" => "V", + "GTG" => "V", + "GTT" => "V", + "TAA" => "*", + "TAC" => "Y", + "TAG" => "*", + "TAT" => "Y", + "TCA" => "S", + "TCC" => "S", + "TCG" => "S", + "TCT" => "S", + // caveat lector + "TGA" => "*", + "TGC" => "C", + "TGG" => "W", + "TGT" => "C", + "TTA" => "L", + "TTC" => "F", + "TTG" => "L", + "TTT" => "F", + // degenerate codons + "AAR" => "K", + "AAY" => "N", + "ACB" => "T", + "ACD" => "T", + "ACH" => "T", + "ACK" => "T", + "ACM" => "T", + "ACN" => "T", + "ACR" => "T", + "ACS" => "T", + "ACV" => "T", + "ACW" => "T", + "ACY" => "T", + "AGR" => "R", + "AGY" => "S", + "ATH" => "I", + "ATM" => "I", + "ATW" => "I", + "ATY" => "I", + "CAR" => "Q", + "CAY" => "H", + "CCB" => "P", + "CCD" => "P", + "CCH" => "P", + "CCK" => "P", + "CCM" => "P", + "CCN" => "P", + "CCR" => "P", + "CCS" => "P", + "CCV" => "P", + "CCW" => "P", + "CCY" => "P", + "CGB" => "R", + "CGD" => "R", + "CGH" => "R", + "CGK" => "R", + "CGM" => "R", + "CGN" => "R", + "CGR" => "R", + "CGS" => "R", + "CGV" => "R", + "CGW" => "R", + "CGY" => "R", + "CTB" => "L", + "CTD" => "L", + "CTH" => "L", + "CTK" => "L", + "CTM" => "L", + "CTN" => "L", + "CTR" => "L", + "CTS" => "L", + "CTV" => "L", + "CTW" => "L", + "CTY" => "L", + "GAR" => "E", + "GAY" => "D", + "GCB" => "A", + "GCD" => "A", + "GCH" => "A", + "GCK" => "A", + "GCM" => "A", + "GCN" => "A", + "GCR" => "A", + "GCS" => "A", + "GCV" => "A", + "GCW" => "A", + "GCY" => "A", + "GGB" => "G", + "GGD" => "G", + "GGH" => "G", + "GGK" => "G", + "GGM" => "G", + "GGN" => "G", + "GGR" => "G", + "GGS" => "G", + "GGV" => "G", + "GGW" => "G", + "GGY" => "G", + "GTB" => "V", + "GTD" => "V", + "GTH" => "V", + "GTK" => "V", + "GTM" => "V", + "GTN" => "V", + "GTR" => "V", + "GTS" => "V", + "GTV" => "V", + "GTW" => "V", + "GTY" => "V", + "MGA" => "R", + "MGG" => "R", + "MGR" => "R", + "TAR" => "*", + "TAY" => "Y", + "TCB" => "S", + "TCD" => "S", + "TCH" => "S", + "TCK" => "S", + "TCM" => "S", + "TCN" => "S", + "TCR" => "S", + "TCS" => "S", + "TCV" => "S", + "TCW" => "S", + "TCY" => "S", + "TGY" => "C", + "TRA" => "*", + "TTR" => "L", + "TTY" => "F", + "YTA" => "L", + "YTG" => "L", + "YTR" => "L", +}; + +/// Translation table for selenocysteine. +static DNA_TO_AA1_SEC: phf::Map<&'static str, &'static str> = phf_map! { + "AAA" => "K", + "AAC" => "N", + "AAG" => "K", + "AAT" => "N", + "ACA" => "T", + "ACC" => "T", + "ACG" => "T", + "ACT" => "T", + "AGA" => "R", + "AGC" => "S", + "AGG" => "R", + "AGT" => "S", + "ATA" => "I", + "ATC" => "I", + "ATG" => "M", + "ATT" => "I", + "CAA" => "Q", + "CAC" => "H", + "CAG" => "Q", + "CAT" => "H", + "CCA" => "P", + "CCC" => "P", + "CCG" => "P", + "CCT" => "P", + "CGA" => "R", + "CGC" => "R", + "CGG" => "R", + "CGT" => "R", + "CTA" => "L", + "CTC" => "L", + "CTG" => "L", + "CTT" => "L", + "GAA" => "E", + "GAC" => "D", + "GAG" => "E", + "GAT" => "D", + "GCA" => "A", + "GCC" => "A", + "GCG" => "A", + "GCT" => "A", + "GGA" => "G", + "GGC" => "G", + "GGG" => "G", + "GGT" => "G", + "GTA" => "V", + "GTC" => "V", + "GTG" => "V", + "GTT" => "V", + "TAA" => "*", + "TAC" => "Y", + "TAG" => "*", + "TAT" => "Y", + "TCA" => "S", + "TCC" => "S", + "TCG" => "S", + "TCT" => "S", + // caveat lector + "TGA" => "U", + "TGC" => "C", + "TGG" => "W", + "TGT" => "C", + "TTA" => "L", + "TTC" => "F", + "TTG" => "L", + "TTT" => "F", + // degenerate codons + "AAR" => "K", + "AAY" => "N", + "ACB" => "T", + "ACD" => "T", + "ACH" => "T", + "ACK" => "T", + "ACM" => "T", + "ACN" => "T", + "ACR" => "T", + "ACS" => "T", + "ACV" => "T", + "ACW" => "T", + "ACY" => "T", + "AGR" => "R", + "AGY" => "S", + "ATH" => "I", + "ATM" => "I", + "ATW" => "I", + "ATY" => "I", + "CAR" => "Q", + "CAY" => "H", + "CCB" => "P", + "CCD" => "P", + "CCH" => "P", + "CCK" => "P", + "CCM" => "P", + "CCN" => "P", + "CCR" => "P", + "CCS" => "P", + "CCV" => "P", + "CCW" => "P", + "CCY" => "P", + "CGB" => "R", + "CGD" => "R", + "CGH" => "R", + "CGK" => "R", + "CGM" => "R", + "CGN" => "R", + "CGR" => "R", + "CGS" => "R", + "CGV" => "R", + "CGW" => "R", + "CGY" => "R", + "CTB" => "L", + "CTD" => "L", + "CTH" => "L", + "CTK" => "L", + "CTM" => "L", + "CTN" => "L", + "CTR" => "L", + "CTS" => "L", + "CTV" => "L", + "CTW" => "L", + "CTY" => "L", + "GAR" => "E", + "GAY" => "D", + "GCB" => "A", + "GCD" => "A", + "GCH" => "A", + "GCK" => "A", + "GCM" => "A", + "GCN" => "A", + "GCR" => "A", + "GCS" => "A", + "GCV" => "A", + "GCW" => "A", + "GCY" => "A", + "GGB" => "G", + "GGD" => "G", + "GGH" => "G", + "GGK" => "G", + "GGM" => "G", + "GGN" => "G", + "GGR" => "G", + "GGS" => "G", + "GGV" => "G", + "GGW" => "G", + "GGY" => "G", + "GTB" => "V", + "GTD" => "V", + "GTH" => "V", + "GTK" => "V", + "GTM" => "V", + "GTN" => "V", + "GTR" => "V", + "GTS" => "V", + "GTV" => "V", + "GTW" => "V", + "GTY" => "V", + "MGA" => "R", + "MGG" => "R", + "MGR" => "R", + "TAR" => "*", + "TAY" => "Y", + "TCB" => "S", + "TCD" => "S", + "TCH" => "S", + "TCK" => "S", + "TCM" => "S", + "TCN" => "S", + "TCR" => "S", + "TCS" => "S", + "TCV" => "S", + "TCW" => "S", + "TCY" => "S", + "TGY" => "C", + "TRA" => "*", + "TTR" => "L", + "TTY" => "F", + "YTA" => "L", + "YTG" => "L", + "YTR" => "L", +}; + +static IUPAC_AMBIGUITY_CODES: &str = "BDHVNUWSMKRYZ"; + +/// Allow selection of translation table. +pub enum TranslationTable { + Standard, + #[allow(dead_code)] + Selenocysteine, +} + +/// Coerces string of 1- or 3-letter amino acids to 1-letter representation. +/// +/// Fails if the sequence is not of valid 3/1-letter amino acids. +/// +/// # Args +/// +/// * `seq` -- An amino acid sequence. +/// +/// # Returns +/// +/// The sequence as one of 1-letter amino acids. +#[allow(dead_code)] +pub fn aa_to_aa1(seq: &str) -> Result { + if looks_like_aa3_p(seq) { + aa3_to_aa1(seq) + } else { + Ok(seq.to_string()) + } +} + +/// Coerces string of 1- or 3-letter amino acids to 3-letter representation. +/// +/// Fails if the sequence is not of valid 3/1-letter amino acids. +/// +/// # Args +/// +/// * `seq` -- An amino acid sequence. +/// +/// # Returns +/// +/// The sequence as one of 1-letter amino acids. +#[allow(dead_code)] +pub fn aa_to_aa3(seq: &str) -> Result { + if looks_like_aa3_p(seq) { + Ok(seq.to_string()) + } else { + aa1_to_aa3(seq) + } +} + +/// Converts string of 1-letter amino acids to 3-letter amino acids. +/// +/// Fails if the sequence is not of 1-letter amino acids. +/// +/// # Args +/// +/// * `seq` -- An amino acid sequence as 1-letter amino acids. +/// +/// # Returns +/// +/// The sequence as 3-letter amino acids. +#[allow(dead_code)] +pub fn aa1_to_aa3(seq: &str) -> Result { + let mut result = String::new(); + + for i in 0..seq.len() { + let aa3 = AA1_TO_AA3_LUT.get(&seq[i..(i + 1)]).ok_or(anyhow::anyhow!( + "Invalid 1-letter amino acid: {}", + &seq[i..(i + 1)] + ))?; + result.push_str(aa3); + } + + Ok(result) +} + +/// Converts string of 3-letter amino acids to 1-letter amino acids. +/// +/// Fails if the sequence is not of 3-letter amino acids. +/// +/// # Args +/// +/// * `seq` -- An amino acid sequence as 3-letter amino acids. +/// +/// # Returns +/// +/// The sequence as 1-letter amino acids. +#[allow(dead_code)] +pub fn aa3_to_aa1(seq: &str) -> Result { + if seq.len() % 3 != 0 { + return Err(anyhow::anyhow!( + "3-letter amino acid sequence length is not multiple of three" + )); + } + + let mut result = String::new(); + + for i in 0..(seq.len() / 3) { + let aa3 = &seq[(i * 3)..((i + 1) * 3)]; + let aa1 = AA3_TO_AA1_LUT + .get(aa3) + .ok_or(anyhow::anyhow!("Invalid 3-letter amino acid: {}", &aa3,))?; + result.push_str(aa1); + } + + Ok(result) +} + +/// Indicates whether a string looks like a 3-letter AA string. +/// +/// # Args +/// +/// * `looks_like_aa3_p` -- A sequence +/// +/// # Returns +/// +/// Whether the string is of the format of a 3-letter AA string. +#[allow(dead_code)] +fn looks_like_aa3_p(seq: &str) -> bool { + seq.len() % 3 == 0 && seq.chars().nth(1).map(|c| c.is_lowercase()).unwrap_or(true) +} + +/// Translates a DNA or RNA sequence into a single-letter amino acid sequence. +/// +/// # Args +/// +/// * `seq` -- A nucleotide sequence. +/// * `full_codons` -- If `true`, forces sequence to have length that is a multiple of 3 +/// and return an `Err` otherwise. If `false`, `ter_symbol` will be added as the last +/// amino acid. This corresponds to biopython's behavior of padding the last codon with +/// `N` characters. +/// * `ter_symbol` -- Placeholder for the last amino acid if sequence length is not divisible +/// by three and `full_codons` is `false`. +/// * `translation_table` -- Indicates which codon to amino acid translation table to use. +/// +/// # Returns +/// +/// The corresponding single letter amino acid sequence. +pub fn translate_cds( + seq: &str, + full_codons: bool, + ter_symbol: &str, + translation_table: TranslationTable, +) -> Result { + if seq.is_empty() { + return Ok("".to_string()); + } + + if full_codons && seq.len() % 3 != 0 { + return Err(anyhow::anyhow!( + "Sequence length must be a multiple of three" + )); + } + + let translation_table = match translation_table { + TranslationTable::Standard => &DNA_TO_AA1_LUT, + TranslationTable::Selenocysteine => &DNA_TO_AA1_SEC, + }; + + let seq = seq.replace('u', "t").replace('U', "T").to_uppercase(); + + let mut result = String::new(); + for i in 0..(seq.len() / 3) { + let codon = &seq[(i * 3)..((i + 1) * 3)]; + let aa = translation_table.get(codon); + if let Some(aa) = aa { + result.push_str(aa); + } else { + // If this contains an ambiguous code, set aa to X, otherwise, throw error + let mut ok = false; + for i in 0..codon.len() { + if IUPAC_AMBIGUITY_CODES.contains(&codon[i..(i + 1)]) { + ok = true; + result.push('X'); + break; + } + } + if !ok { + return Err(anyhow::anyhow!( + "Codon {} at position {}..{} is undefined in codon table", + codon, + i + 1, + i + 3 + )); + } + } + } + + // Check for trailing bases and add the ter symbol if required. + if !full_codons && seq.len() % 3 != 0 { + result.push_str(ter_symbol); + } + + Ok(result) +} + +/// Converts sequence to normalized representation for hashing. +/// +/// Essentially, removes whitespace and asterisks, and uppercases the string. +/// +/// # Args +/// +/// * `seq` -- The sequence to be normalized. +/// +/// # Returns +/// +/// The sequence as a string of uppercase letters. +pub fn normalize_sequence(seq: &str) -> Result { + let mut result = String::new(); + + for c in seq.chars() { + if !c.is_whitespace() && c != '*' { + let c = c.to_ascii_uppercase(); + if c.is_alphabetic() { + result.push(c) + } else { + return Err(anyhow::anyhow!("Character {} is not alphetic", c)); + } + } + } + + Ok(result) +} + +/// Convert sequence to unicode MD5 hex digest. +/// +/// Fails if normalization is not possible. +/// +/// # Args +/// +/// * `seq` -- A sequence +/// * `normalize` -- Whether to normalize the sequence before conversion, i.e., to ensure +/// representation as uppercase ltters without whitespace or asterisks. +/// +/// # Returns +/// +/// Unicode MD5 hex digest representation of sequence. +pub fn seq_md5(seq: &str, normalize: bool) -> Result { + let seq = if normalize { + normalize_sequence(seq)? + } else { + seq.to_owned() + }; + let mut hasher = Md5::new(); + hasher.update(seq); + let hash = hasher.finalize(); + let mut buf = [0u8; 64]; + let checksum = base16ct::lower::encode_str(&hash, &mut buf).unwrap(); + Ok(checksum.to_owned()) +} + +#[cfg(test)] +mod test { + use super::*; + + use pretty_assertions::assert_eq; + + #[test] + fn suffix_trimming() { + assert_eq!( + trim_common_suffixes("", ""), + (0, "".to_string(), "".to_string()) + ); + assert_eq!( + trim_common_suffixes("", "C"), + (0, "".to_string(), "C".to_string()) + ); + assert_eq!( + trim_common_suffixes("C", ""), + (0, "C".to_string(), "".to_string()) + ); + assert_eq!( + trim_common_suffixes("A", "AA"), + (1, "".to_string(), "A".to_string()) + ); + assert_eq!( + trim_common_suffixes("AT", "AG"), + (0, "AT".to_string(), "AG".to_string()) + ); + assert_eq!( + trim_common_suffixes("ATCG", "AGCG"), + (2, "AT".to_string(), "AG".to_string()) + ); + } + + #[test] + fn prefix_trimming() { + assert_eq!( + trim_common_prefixes("", ""), + (0, "".to_string(), "".to_string()) + ); + assert_eq!( + trim_common_prefixes("", "C"), + (0, "".to_string(), "C".to_string()) + ); + assert_eq!( + trim_common_prefixes("C", ""), + (0, "C".to_string(), "".to_string()) + ); + assert_eq!( + trim_common_prefixes("TA", "GA"), + (0, "TA".to_string(), "GA".to_string()) + ); + assert_eq!( + trim_common_prefixes("CGTA", "CGGA"), + (2, "TA".to_string(), "GA".to_string()) + ); + } + + #[test] + fn revcomp_cases() { + assert_eq!(revcomp(""), ""); + assert_eq!(revcomp("A"), "T"); + assert_eq!(revcomp("AG"), "CT"); + assert_eq!(revcomp("CGAG"), "CTCG"); + } + + #[test] + fn aa_to_aa1_examples() -> Result<(), anyhow::Error> { + assert_eq!(aa_to_aa1("")?, ""); + assert_eq!(aa_to_aa1("CATSARELAME")?, "CATSARELAME"); + assert_eq!( + aa_to_aa1("CysAlaThrSerAlaArgGluLeuAlaMetGlu")?, + "CATSARELAME" + ); + + Ok(()) + } + + #[test] + fn aa1_to_aa3_examples() -> Result<(), anyhow::Error> { + assert_eq!(aa1_to_aa3("")?, ""); + assert_eq!( + aa1_to_aa3("CATSARELAME")?, + "CysAlaThrSerAlaArgGluLeuAlaMetGlu" + ); + + Ok(()) + } + + #[test] + fn aa3_to_aa1_examples() -> Result<(), anyhow::Error> { + assert!(aa3_to_aa1("Te").is_err()); + assert_eq!(aa3_to_aa1("")?, ""); + assert_eq!( + aa3_to_aa1("CysAlaThrSerAlaArgGluLeuAlaMetGlu")?, + "CATSARELAME" + ); + + Ok(()) + } + + #[test] + fn translate_cds_examples() -> Result<(), anyhow::Error> { + assert_eq!( + translate_cds("ATGCGA", true, "*", TranslationTable::Standard)?, + "MR" + ); + assert_eq!( + translate_cds("AUGCGA", true, "*", TranslationTable::Standard)?, + "MR" + ); + assert_eq!( + translate_cds("", true, "*", TranslationTable::Standard)?, + "" + ); + assert!(translate_cds("AUGCG", true, "*", TranslationTable::Standard).is_err()); + assert_eq!( + translate_cds("AUGCG", false, "*", TranslationTable::Standard)?, + "M*" + ); + assert_eq!( + translate_cds("ATGTAN", true, "*", TranslationTable::Standard)?, + "MX" + ); + assert_eq!( + translate_cds("CCN", true, "*", TranslationTable::Standard)?, + "P" + ); + assert_eq!( + translate_cds("TRA", true, "*", TranslationTable::Standard)?, + "*" + ); + assert_eq!( + translate_cds("TTNTA", false, "*", TranslationTable::Standard)?, + "X*" + ); + assert_eq!( + translate_cds("CTB", true, "*", TranslationTable::Standard)?, + "L" + ); + assert_eq!( + translate_cds("AGM", true, "*", TranslationTable::Standard)?, + "X" + ); + assert_eq!( + translate_cds("GAS", true, "*", TranslationTable::Standard)?, + "X" + ); + assert_eq!( + translate_cds("CUN", true, "*", TranslationTable::Standard)?, + "L" + ); + assert!(translate_cds("AUGCGQ", true, "*", TranslationTable::Standard).is_err()); + + Ok(()) + } + + #[test] + fn seq_md5_examples() -> Result<(), anyhow::Error> { + assert_eq!(seq_md5("", true)?, "d41d8cd98f00b204e9800998ecf8427e"); + assert_eq!(seq_md5("ACGT", true)?, "f1f8f4bf413b16ad135722aa4591043e"); + assert_eq!(seq_md5("ACGT*", true)?, "f1f8f4bf413b16ad135722aa4591043e"); + assert_eq!( + seq_md5(" A C G T ", true)?, + "f1f8f4bf413b16ad135722aa4591043e" + ); + assert_eq!(seq_md5("acgt", true)?, "f1f8f4bf413b16ad135722aa4591043e"); + assert_eq!(seq_md5("acgt", false)?, "db516c3913e179338b162b2476d1c23f"); + + Ok(()) + } + + #[test] + fn normalize_sequence_examples() -> Result<(), anyhow::Error> { + assert_eq!(normalize_sequence("ACGT")?, "ACGT"); + assert_eq!(normalize_sequence(" A C G T * ")?, "ACGT"); + assert!(normalize_sequence("ACGT1").is_err()); + + Ok(()) + } +} + +// +// Copyright 2023 hgvs-rs Contributors +// Copyright 2014 Bioutils Contributors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// diff --git a/src/utils.rs b/src/utils.rs deleted file mode 100644 index 113e427..0000000 --- a/src/utils.rs +++ /dev/null @@ -1,123 +0,0 @@ -pub(crate) fn trim_common_prefixes(reference: &str, alternative: &str) -> (usize, String, String) { - if reference.is_empty() || alternative.is_empty() { - return (0, reference.to_string(), alternative.to_string()); - } - - let mut trim = 0; - while trim < reference.len() && trim < alternative.len() { - if reference.chars().nth(trim) != alternative.chars().nth(trim) { - break; - } - - trim += 1; - } - - ( - trim, - reference[trim..].to_string(), - alternative[trim..].to_string(), - ) -} - -pub(crate) fn trim_common_suffixes(reference: &str, alternative: &str) -> (usize, String, String) { - if reference.is_empty() || alternative.is_empty() { - return (0, reference.to_string(), alternative.to_string()); - } - - let mut trim = 0; - let mut i_r = reference.len(); - let mut i_a = alternative.len(); - let mut pad = 0; - while trim < reference.len() && trim < alternative.len() { - trim += 1; - assert!(i_r > 0); - assert!(i_a > 0); - i_r -= 1; - i_a -= 1; - - if reference.chars().nth(i_r) != alternative.chars().nth(i_a) { - pad = 1; - break; - } - } - - ( - trim - pad, - reference[..(i_r + pad)].to_string(), - alternative[..(i_a + pad)].to_string(), - ) -} - -/// Reverse complementing shortcut. -pub(crate) fn revcomp(seq: &str) -> String { - std::str::from_utf8(&bio::alphabets::dna::revcomp(seq.as_bytes())) - .unwrap() - .to_string() -} - -#[cfg(test)] -mod test { - use pretty_assertions::assert_eq; - - use crate::utils::{revcomp, trim_common_prefixes, trim_common_suffixes}; - - #[test] - fn suffix_trimming() { - assert_eq!( - trim_common_suffixes("", ""), - (0, "".to_string(), "".to_string()) - ); - assert_eq!( - trim_common_suffixes("", "C"), - (0, "".to_string(), "C".to_string()) - ); - assert_eq!( - trim_common_suffixes("C", ""), - (0, "C".to_string(), "".to_string()) - ); - assert_eq!( - trim_common_suffixes("A", "AA"), - (1, "".to_string(), "A".to_string()) - ); - assert_eq!( - trim_common_suffixes("AT", "AG"), - (0, "AT".to_string(), "AG".to_string()) - ); - assert_eq!( - trim_common_suffixes("ATCG", "AGCG"), - (2, "AT".to_string(), "AG".to_string()) - ); - } - - #[test] - fn prefix_trimming() { - assert_eq!( - trim_common_prefixes("", ""), - (0, "".to_string(), "".to_string()) - ); - assert_eq!( - trim_common_prefixes("", "C"), - (0, "".to_string(), "C".to_string()) - ); - assert_eq!( - trim_common_prefixes("C", ""), - (0, "C".to_string(), "".to_string()) - ); - assert_eq!( - trim_common_prefixes("TA", "GA"), - (0, "TA".to_string(), "GA".to_string()) - ); - assert_eq!( - trim_common_prefixes("CGTA", "CGGA"), - (2, "TA".to_string(), "GA".to_string()) - ); - } - - #[test] - fn revcomp_cases() { - assert_eq!(revcomp(""), ""); - assert_eq!(revcomp("A"), "T"); - assert_eq!(revcomp("AG"), "CT"); - assert_eq!(revcomp("CGAG"), "CTCG"); - } -} diff --git a/tests/data/mapper/sanity_cp.tsv b/tests/data/mapper/sanity_cp.tsv new file mode 100644 index 0000000..81fc9de --- /dev/null +++ b/tests/data/mapper/sanity_cp.tsv @@ -0,0 +1,10 @@ +accession transcript_sequence cds_start_i cds_end_i +NM_999999.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGCGAAATAGGGG 9 39 +NM_999998.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGCGAAATAGAGGAGGTAGTTTCGC 9 39 +NM_999997.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGTAGAATAGAGGAGGCAGTTTCGC 9 39 +NM_999996.1 AAAATCAAAATGAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 39 +NM_999995.1 AAAATCAAAATGAAAAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 42 +NM_999994.1 AAAATCAAAATGAAAAAAAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 45 +NM_999993.1 AAAATCAAAATGGGGAGGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGC 9 39 +NM_999992.1 AAAATCAAAATGGGGTAGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGC 9 39 +NM_999992.2 AAAATCAAAATGGGGTAGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGCC 9 40 diff --git a/tests/data/seqrepo_cache.fasta b/tests/data/seqrepo_cache.fasta index 7f551fb..20066a5 100644 --- a/tests/data/seqrepo_cache.fasta +++ b/tests/data/seqrepo_cache.fasta @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:71636c78aeaa44eb4943092d66e6ce17c328ac0c230e974ea62e9fb8cb17fa32 -size 20268 +oid sha256:b804ae2dcf45b8f6ffdb21be233c93ddf4dda4f6856df693b717aba237c20a31 +size 33376