Skip to content

Commit

Permalink
feat: ported over variant normalizer (#19)
Browse files Browse the repository at this point in the history
Also contains parts of the variant mapper as far as it was required.
  • Loading branch information
holtgrewe committed Feb 24, 2023
1 parent f70b5c5 commit 827a54f
Show file tree
Hide file tree
Showing 22 changed files with 4,836 additions and 182 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,8 @@ jobs:
env:
TEST_UTA_DATABASE_URL: postgres://uta_admin:[email protected]/uta
TEST_UTA_DATABASE_SCHEMA: uta_20210129
TEST_SEQREPO_CACHE_MODE: read
TEST_SEQREPO_CACHE_PATH: tests/data/seqrepo_cache.fasta

- name: Codecov
uses: codecov/codecov-action@v3
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@

*~
.*.sw?
/.vscode
7 changes: 7 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,21 @@ edition = "2021"

[dependencies]
anyhow = "1.0.69"
bio = "1.1.0"
chrono = "0.4.23"
enum-map = "2.4.2"
flate2 = "1.0.25"
lazy_static = "1.4.0"
linked-hash-map = "0.5.6"
log = "0.4.17"
nom = "7.1.3"
postgres = { version = "0.19.4", features = ["with-chrono-0_4"] }
pretty_assertions = "1.3.0"
regex = "1.7.1"
seqrepo = { git = "https://github.com/bihealth/seqrepo-rs.git", branch = "27-whole-fasta-sequences-are-read" }
serde = { version = "1.0.152", features = ["derive"] }
serde_json = "1.0.93"

[dev-dependencies]
env_logger = "0.10.0"
test-log = "0.2.11"
44 changes: 43 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,47 @@ To use the public database:
```
export TEST_UTA_DATABASE_URL=postgres://anonymous:[email protected]:/uta
export TEST_UTA_DATABASE_SCHEMA=uta_20210129
$ cargo test
```

Note that [seqrepo-rs](https://github.com/bihealth/seqrepo-rs) is used for access to the genome contig sequence.
It is inconvenient to provide sub sets of sequences in SeqRepo format.
Instead, we use a build-cache/read-cache approach that is also used by `biocommons/hgvs`.

To build the cache, you will first need a download of the seqrepo [as described in biocommons/biocommons.seqrepo Quickstart](https://github.com/biocommons/biocommons.seqrepo#quick-start).
Then, you configure the running of tests for `hgvs-rs` as follows:

```
export TEST_SEQREPO_CACHE_MODE=write
export TEST_SEQREPO_PATH=path/to/seqrepo/instance
export TEST_SEQREPO_CACHE_PATH=tests/data/seqrepo_cache.fasta
```

When running the tests with `cargo test`, the cache file will be (re-)written.
If you don't want to regenerate the cache then you can use the following settings.
With these settings, the cache will only be read.

```
export TEST_SEQREPO_CACHE_MODE=read
export TEST_SEQREPO_CACHE_PATH=tests/data/seqrepo_cache.fasta
```

After either this, you can run the tests.

```
cargo test
```

## Creating Recuded UTA Databases

The script `tests/data/data/bootstrap.sh` allows to easily build a reduced set of the UTA database given a list of genes.
The process is as follows:

1. You edit `bootstrap.sh` to include the HGNC gene symbols of the transcripts that you want to use.
2. You run the bootstrap script.
This will download the given UTA dump and reduce it to the information related to these transcripts.

```
$ bootstrap.sh http://dl.biocommons.org/uta uta_20210129
```

The `*.pgd.gz` file is added to the Git repository via `git-lfs` and in CI, this minimal database will be used.
4 changes: 3 additions & 1 deletion src/data/interface.rs
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,9 @@ pub trait Provider {
/// # Arguments
///
/// * `ac` -- accession
fn get_seq(&self, ac: &str) -> Result<String, anyhow::Error>;
fn get_seq(&self, ac: &str) -> Result<String, anyhow::Error> {
self.get_seq_part(ac, None, None)
}

/// Return sequence part for the given accession.
///
Expand Down
1 change: 1 addition & 0 deletions src/data/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
///! Datatypes, interfaces, and data acess.
pub mod interface;
pub mod uta;
pub mod uta_sr;
11 changes: 5 additions & 6 deletions src/data/uta.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,7 @@ pub struct Config {
/// URL with the connection string, e.g.
/// `"postgresql://anonymous:[email protected]/uta'"`.
pub db_url: String,
/// The databaser schema to use, corresponds to the data version, e.g.,
/// `uta_20210129`.
/// The databaser schema to use, corresponds to the data version, e.g., `uta_20210129`.
pub db_schema: String,
}

Expand Down Expand Up @@ -155,6 +154,10 @@ impl TryFrom<Row> for TxMappingOptionsRecord {
}
}

/// This provider provides information from a UTA Postgres database only.
///
/// The sequences are also read from the database which implies that no genome contig information
/// is available. Use `uta_sr::Provider` for a variant that is enabled to use a SeqRepo.
pub struct Provider {
/// Configuration for the access.
config: Config,
Expand Down Expand Up @@ -238,10 +241,6 @@ impl ProviderInterface for Provider {
Ok(None)
}

fn get_seq(&self, ac: &str) -> Result<String, anyhow::Error> {
self.get_seq_part(ac, None, None)
}

fn get_seq_part(
&self,
ac: &str,
Expand Down
253 changes: 253 additions & 0 deletions src/data/uta_sr.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,253 @@
//! Code for enabling UTA access where sequences from a SeqRepo.
//!
//! * https://github.com/biocommons/uta
//! * https://github.com/biocommons/biocommons.seqrepo
//! * https://github.com/bihealth/seqrepo-rs

use std::path::PathBuf;
use std::rc::Rc;

use crate::data::uta::{Config as UtaConfig, Provider as UtaProvider};
use crate::data::{
interface::GeneInfoRecord, interface::Provider as ProviderInterface, interface::TxExonsRecord,
interface::TxForRegionRecord, interface::TxIdentityInfo, interface::TxInfoRecord,
interface::TxMappingOptionsRecord, interface::TxSimilarityRecord,
};
use seqrepo::{AliasOrSeqId, Interface as SeqRepoInterface, SeqRepo};

/// Configurationf or the `data::uta::Provider`.
#[derive(Debug, PartialEq, Clone)]
pub struct Config {
/// URL with the connection string, e.g.
/// `"postgresql://anonymous:[email protected]/uta'"`.
pub db_url: String,
/// The databaser schema to use, corresponds to the data version, e.g., `uta_20210129`.
pub db_schema: String,
/// Path to the seqrepo directory, e.g., `/usr/local/share/seqrepo/latest`. The last path
/// component is the "instance" name.
pub seqrepo_path: String,
}

/// This provider provides information from a UTA and a SeqRepo.
///
/// Transcripts from a UTA Postgres database, sequences comes from a SeqRepo. This makes
/// genome contig information available in contrast to `data::uta::Provider`.
pub struct Provider {
inner: UtaProvider,
seqrepo: Rc<dyn SeqRepoInterface>,
}

impl Provider {
/// Create a new provider that uses UTA and SeqRepo information from the given configuration.
///
/// This uses `seqrepo::SeqRepo` for the sequence repository. You can inject any
/// `seqrepo::Interface` implementation using `Provider::with_seqrepo`.
pub fn new(config: Config) -> Result<Provider, anyhow::Error> {
let seqrepo = PathBuf::from(&config.seqrepo_path);
let path = seqrepo
.parent()
.ok_or(anyhow::anyhow!(
"Could not get parent from {}",
&config.seqrepo_path
))?
.to_str()
.unwrap()
.to_string();
let instance = seqrepo
.file_name()
.ok_or(anyhow::anyhow!(
"Could not get basename from {}",
&config.seqrepo_path
))?
.to_str()
.unwrap()
.to_string();

Ok(Self {
inner: UtaProvider::with_config(&UtaConfig {
db_url: config.db_url.clone(),
db_schema: config.db_schema,
})?,
seqrepo: Rc::new(SeqRepo::new(path, &instance)?),
})
}

/// Create a new provider allowing to inject a seqrepo.
pub fn with_seqrepo(
config: Config,
seqrepo: Rc<dyn SeqRepoInterface>,
) -> Result<Provider, anyhow::Error> {
Ok(Self {
inner: UtaProvider::with_config(&UtaConfig {
db_url: config.db_url.clone(),
db_schema: config.db_schema,
})?,
seqrepo,
})
}
}

impl ProviderInterface for Provider {
fn data_version(&self) -> &str {
self.inner.data_version()
}

fn schema_version(&self) -> &str {
self.inner.schema_version()
}

fn get_assembly_map(
&self,
assembly: crate::static_data::Assembly,
) -> linked_hash_map::LinkedHashMap<String, String> {
self.inner.get_assembly_map(assembly)
}

fn get_gene_info(&self, hgnc: &str) -> Result<GeneInfoRecord, anyhow::Error> {
self.inner.get_gene_info(hgnc)
}

fn get_pro_ac_for_tx_ac(&self, tx_ac: &str) -> Result<Option<String>, anyhow::Error> {
self.inner.get_pro_ac_for_tx_ac(tx_ac)
}

fn get_seq_part(
&self,
ac: &str,
begin: Option<usize>,
end: Option<usize>,
) -> Result<String, anyhow::Error> {
let aos = AliasOrSeqId::Alias {
value: ac.to_owned(),
namespace: None,
};
self.seqrepo.fetch_sequence_part(&aos, begin, end)
}

fn get_similar_transcripts(
&self,
tx_ac: &str,
) -> Result<Vec<TxSimilarityRecord>, anyhow::Error> {
self.inner.get_similar_transcripts(tx_ac)
}

fn get_tx_exons(
&self,
tx_ac: &str,
alt_ac: &str,
alt_aln_method: &str,
) -> Result<Vec<TxExonsRecord>, anyhow::Error> {
self.inner.get_tx_exons(tx_ac, alt_ac, alt_aln_method)
}

fn get_tx_for_gene(&self, gene: &str) -> Result<Vec<TxInfoRecord>, anyhow::Error> {
self.inner.get_tx_for_gene(gene)
}

fn get_tx_for_region(
&self,
alt_ac: &str,
alt_aln_method: &str,
start_i: i32,
end_i: i32,
) -> Result<Vec<TxForRegionRecord>, anyhow::Error> {
self.inner
.get_tx_for_region(alt_ac, alt_aln_method, start_i, end_i)
}

fn get_tx_identity_info(&self, tx_ac: &str) -> Result<TxIdentityInfo, anyhow::Error> {
self.inner.get_tx_identity_info(tx_ac)
}

fn get_tx_info(
&self,
tx_ac: &str,
alt_ac: &str,
alt_aln_method: &str,
) -> Result<TxInfoRecord, anyhow::Error> {
self.inner.get_tx_info(tx_ac, alt_ac, alt_aln_method)
}

fn get_tx_mapping_options(
&self,
tax_ac: &str,
) -> Result<Vec<TxMappingOptionsRecord>, anyhow::Error> {
self.inner.get_tx_mapping_options(tax_ac)
}
}

/// Code for helping setup of UTA providers, e.g., for setting up caching of SeqRepo results.
pub mod test_helpers {
use std::{fs::File, path::PathBuf, rc::Rc};

use seqrepo::{
CacheReadingSeqRepo, CacheWritingSeqRepo, Interface as SeqRepoInterface, SeqRepo,
};

use super::{Config, Provider, ProviderInterface};

/// Setup a UTA Provider with data source depending on environment variables.
///
/// See README.md for information on environment variable setup.
pub fn build_provider() -> Result<Rc<dyn ProviderInterface>, anyhow::Error> {
let db_url = std::env::var("TEST_UTA_DATABASE_URL")
.expect("Environment variable TEST_UTA_DATABASE_URL undefined!");
let db_schema = std::env::var("TEST_UTA_DATABASE_SCHEMA")
.expect("Environment variable TEST_UTA_DATABASE_SCHEMA undefined!");
let sr_cache_mode = std::env::var("TEST_SEQREPO_CACHE_MODE")
.expect("Environment variable TEST_SEQREPO_CACHE_MODE undefined!");
let sr_cache_path = std::env::var("TEST_SEQREPO_CACHE_PATH")
.expect("Environment variable TEST_SEQREPO_CACHE_PATH undefined!");

let (seqrepo, seqrepo_path) = if sr_cache_mode == "read" {
let seqrepo: Rc<dyn SeqRepoInterface> =
Rc::new(CacheReadingSeqRepo::new(sr_cache_path)?);
(seqrepo, "".to_string())
} else if sr_cache_mode == "write" {
build_writing_sr(sr_cache_path)?
} else {
panic!("Invalid cache mode {}", &sr_cache_mode);
};

Ok(Rc::new(Provider::with_seqrepo(
Config {
db_url,
db_schema,
seqrepo_path,
},
seqrepo,
)?))
}

/// Helper that builds the cache writing SeqRepo with inner stock SeqRepo.
fn build_writing_sr(
sr_cache_path: String,
) -> Result<(Rc<dyn SeqRepoInterface>, String), anyhow::Error> {
let seqrepo_path = std::env::var("TEST_SEQREPO_PATH")
.expect("Environment variable TEST_SEQREPO_PATH undefined!");
let path_buf = PathBuf::from(seqrepo_path.clone());
let path = path_buf
.parent()
.ok_or(anyhow::anyhow!(
"Could not get parent from {}",
&seqrepo_path
))?
.to_str()
.unwrap()
.to_string();
let instance = path_buf
.file_name()
.ok_or(anyhow::anyhow!(
"Could not get basename from {}",
&seqrepo_path
))?
.to_str()
.unwrap()
.to_string();
let seqrepo: Rc<dyn SeqRepoInterface> = Rc::new(CacheWritingSeqRepo::new(
SeqRepo::new(path, &instance)?,
File::create(sr_cache_path)?,
));
Ok((seqrepo, seqrepo_path))
}
}
Loading

0 comments on commit 827a54f

Please sign in to comment.