Skip to content

Commit

Permalink
feat: implement VariantMapper (#12) (#13)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe authored Feb 28, 2023
1 parent 8593068 commit 44b10de
Show file tree
Hide file tree
Showing 17 changed files with 2,637 additions and 261 deletions.
5 changes: 5 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,17 @@ edition = "2021"

[dependencies]
anyhow = "1.0.69"
base16ct = "0.2.0"
bio = "1.1.0"
chrono = "0.4.23"
enum-map = "2.4.2"
flate2 = "1.0.25"
lazy_static = "1.4.0"
linked-hash-map = "0.5.6"
log = "0.4.17"
md-5 = "0.10.5"
nom = "7.1.3"
phf = { version = "0.11.1", features = ["macros"] }
postgres = { version = "0.19.4", features = ["with-chrono-0_4"] }
pretty_assertions = "1.3.0"
regex = "1.7.1"
Expand All @@ -21,5 +24,7 @@ serde = { version = "1.0.152", features = ["derive"] }
serde_json = "1.0.93"

[dev-dependencies]
csv = "1.2.0"
env_logger = "0.10.0"
serde = { version = "1.0.152", features = ["derive"] }
test-log = "0.2.11"
6 changes: 6 additions & 0 deletions src/data/interface.rs
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,12 @@ pub trait Provider {
end: Option<usize>,
) -> Result<String, anyhow::Error>;

/// Returns a list of protein accessions for a given sequence.
///
/// The list is guaranteed to contain at least one element with the MD5-based accession
/// (MD5_01234abc..def56789) at the end of the list.
///
/// # Arguments
///
/// * `seq` - The protein sequence for which the accessions are looked up.
///
/// # Errors
///
/// Implementations return an error when the lookup cannot be completed, e.g., when the
/// query against the backing data source fails.
fn get_acs_for_protein_seq(&self, seq: &str) -> Result<Vec<String>, anyhow::Error>;

/// Return a list of transcripts that are similar to the given transcript, with relevant
/// similarity criteria.
///
Expand Down
18 changes: 18 additions & 0 deletions src/data/uta.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use postgres::{Client, NoTls, Row};
use std::fmt::Debug;
use std::sync::Mutex;

use crate::sequences::seq_md5;
use crate::static_data::{Assembly, ASSEMBLY_INFOS};

use crate::data::{
Expand Down Expand Up @@ -276,6 +277,23 @@ impl ProviderInterface for Provider {
Ok(seq[begin..end].into())
}

/// Look up all accessions stored in `seq_anno` for the given protein sequence.
///
/// The sequence is identified by the digest returned from `seq_md5`, which is used as
/// the `seq_id` in the query.  The returned list always ends with the sentinel
/// accession `MD5_<digest>`, so it is never empty.
///
/// # Errors
///
/// Returns an error if computing the digest or running the database query fails.
fn get_acs_for_protein_seq(&self, seq: &str) -> Result<Vec<String>, anyhow::Error> {
    let digest = seq_md5(seq, true)?;
    // The schema name cannot be bound as a parameter, so it is formatted into the
    // statement; the digest itself is passed as the bound parameter `$1`.
    let query = format!(
        "SELECT ac FROM {}.seq_anno WHERE seq_id = $1",
        self.config.db_schema
    );

    let rows = self.conn.lock().unwrap().query(&query, &[&digest])?;
    let mut accessions: Vec<String> = rows
        .into_iter()
        .map(|row| row.get::<_, String>(0))
        .collect();

    // Sentinel accession derived from the digest always comes last.
    accessions.push(format!("MD5_{}", &digest));
    Ok(accessions)
}

fn get_similar_transcripts(
&self,
tx_ac: &str,
Expand Down
4 changes: 4 additions & 0 deletions src/data/uta_sr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,10 @@ impl ProviderInterface for Provider {
self.seqrepo.fetch_sequence_part(&aos, begin, end)
}

/// Look up the accessions for the given protein sequence.
///
/// This provider does not answer the query itself; the call is forwarded unchanged to
/// the wrapped `inner` provider and its result is returned as is.
fn get_acs_for_protein_seq(&self, seq: &str) -> Result<Vec<String>, anyhow::Error> {
    let delegate = &self.inner;
    delegate.get_acs_for_protein_seq(seq)
}

fn get_similar_transcripts(
&self,
tx_ac: &str,
Expand Down
2 changes: 1 addition & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@ pub mod data;
pub mod mapper;
pub mod normalizer;
pub mod parser;
pub(crate) mod sequences;
pub mod static_data;
pub(crate) mod utils;
pub mod validator;
27 changes: 6 additions & 21 deletions src/mapper/alignment.rs
Original file line number Diff line number Diff line change
Expand Up @@ -484,15 +484,12 @@ impl Mapper {

#[cfg(test)]
mod test {
use std::{rc::Rc, str::FromStr};
use std::str::FromStr;

use pretty_assertions::assert_eq;

use crate::{
data::{
interface::{Provider as Interface, TxExonsRecord},
uta::{Config, Provider},
},
data::{interface::TxExonsRecord, uta_sr::test_helpers::build_provider},
parser::{CdsFrom, CdsInterval, CdsPos, GenomeInterval, Mu, TxInterval, TxPos},
};

Expand Down Expand Up @@ -563,30 +560,19 @@ mod test {
assert_eq!(none_if_default(-1i32), Some(-1i32));
}

fn get_config() -> Config {
Config {
db_url: std::env::var("TEST_UTA_DATABASE_URL")
.expect("Environment variable TEST_UTA_DATABASE_URL undefined!"),
db_schema: std::env::var("TEST_UTA_DATABASE_SCHEMA")
.expect("Environment variable TEST_UTA_DATABASE_SCHEMA undefined!"),
}
}

#[test]
fn construction() -> Result<(), anyhow::Error> {
let config = get_config();
let provider = Provider::with_config(&config)?;
let provider = build_provider()?;

assert_eq!(provider.data_version(), config.db_schema);
assert_eq!(provider.data_version(), "uta_20210129");
assert_eq!(provider.schema_version(), "1.1");

Ok(())
}

#[test]
fn failures() -> Result<(), anyhow::Error> {
let config = get_config();
let provider = Rc::new(Provider::with_config(&config)?);
let provider = build_provider()?;

// unknown sequences

Expand Down Expand Up @@ -640,8 +626,7 @@ mod test {
alt_ac: &str,
cases: &Vec<(GenomeInterval, TxInterval, CdsInterval)>,
) -> Result<(), anyhow::Error> {
let config = get_config();
let provider = Rc::new(Provider::with_config(&config)?);
let provider = build_provider()?;
let mapper = Mapper::new(provider, tx_ac, alt_ac, "splign")?;

for (g_interval, n_interval, c_interval) in cases {
Expand Down
Loading

0 comments on commit 44b10de

Please sign in to comment.