Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: change database path setup (#48) #60

Merged
merged 2 commits into from
Apr 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ cd ../mehari
cargo run --release -- \
-v \
db create txs \
--path-out /tmp/txs-out.bin \
--path-out /tmp/txs-out.bin.zst \
--path-cdot-json ../cdot-0.2.12.ensembl.grch37_grch38.json.gz \
--path-cdot-json ../cdot-0.2.12.refseq.grch37_grch38.json.gz \
--path-seqrepo-instance ../hgvs-rs-data/seqrepo-data/master/master
Expand Down
6 changes: 3 additions & 3 deletions docs/db_build.md
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ You can build the transcript database protocolbuffers binary using the following

```text
$ mehari db create txs \
--path-out output/b37/txs.bin \
--path-out output/db/grch37/txs.bin.zst \
\
--path-seqrepo-instance path/to/seqrepo-data/master \
\
Expand All @@ -158,15 +158,15 @@ $ mehari db create txs \
You will have to build the transcript database for each genome release that you want and manually specify the release to `--genome-release`.
For GRCh38, simply use `--genome-release grch38`.

You can enable compression by using the suffix `.gz` for gzip compression and `.zstd` for zstandard compression.
You can enable compression by using the suffix `.gz` for gzip compression and `.zst` for zstandard compression.

# Building ClinVar Database

This assumes that you have converted a recent ClinVar XML file to TSV using [clinvar-tsv](https://github.com/bihealth/clinvar-tsv).

```
$ mehari db create seqvar-clinvar \
--path-output-db ~/Data/mehari/db/seqvars/grch37/clinvar \
--path-output-db ~/Data/mehari/db/grch37/seqvars/clinvar \
--path-clinvar-tsv path/to/clinvar_seqvars.b37.tsv.gz
```

Expand Down
4 changes: 2 additions & 2 deletions src/annotate/seqvars/csq.rs
Original file line number Diff line number Diff line change
Expand Up @@ -702,7 +702,7 @@ mod test {

#[test]
fn annotate_snv_brca1_one_variant() -> Result<(), anyhow::Error> {
let tx_path = "tests/data/annotate/db/seqvars/grch37/txs.bin";
let tx_path = "tests/data/annotate/db/grch37/txs.bin.zst";
let tx_db = load_tx_db(tx_path)?;
let provider = Rc::new(MehariProvider::new(tx_db, Assembly::Grch37p10));

Expand Down Expand Up @@ -829,7 +829,7 @@ mod test {
}

fn annotate_vars(path_tsv: &str, txs: &[String]) -> Result<(), anyhow::Error> {
let tx_path = "tests/data/annotate/db/seqvars/grch37/txs.bin";
let tx_path = "tests/data/annotate/db/grch37/txs.bin.zst";
let tx_db = load_tx_db(tx_path)?;
let provider = Rc::new(MehariProvider::new(tx_db, Assembly::Grch37p10));
let predictor = ConsequencePredictor::new(provider, Assembly::Grch37p10);
Expand Down
16 changes: 7 additions & 9 deletions src/annotate/seqvars/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ pub struct PathOutput {
pub path_output_vcf: Option<String>,

/// Path to the output TSV file (for import into VarFish).
#[arg(long, requires = "path-input-ped")]
#[arg(long)]
pub path_output_tsv: Option<String>,
}

Expand Down Expand Up @@ -489,7 +489,7 @@ pub fn load_tx_db(tx_path: &str) -> Result<TxSeqDatabase, anyhow::Error> {
.map_err(|e| anyhow!("failed to open file {}: {}", tx_path, e))?;
let mut reader: Box<dyn Read> = if tx_path.ends_with(".gz") {
Box::new(flate2::read::MultiGzDecoder::new(file))
} else if tx_path.ends_with(".zstd") {
} else if tx_path.ends_with(".zst") {
Box::new(
zstd::Decoder::new(file)
.map_err(|e| anyhow!("failed to open zstd decoder for {}: {}", tx_path, e))?,
Expand All @@ -499,11 +499,9 @@ pub fn load_tx_db(tx_path: &str) -> Result<TxSeqDatabase, anyhow::Error> {
};

// Now read the whole file into a byte buffer.
let metadata = std::fs::metadata(tx_path)
.map_err(|e| anyhow!("failed to get metadata for {}: {}", tx_path, e))?;
let mut buffer = vec![0; metadata.len() as usize];
let mut buffer = Vec::new();
reader
.read(&mut buffer)
.read_to_end(&mut buffer)
.map_err(|e| anyhow!("failed to read file {}: {}", tx_path, e))?;

// Deserialize the buffer with prost.
Expand Down Expand Up @@ -1398,7 +1396,7 @@ fn run_with_writer(writer: &mut dyn AnnotatedVcfWriter, args: &Args) -> Result<(
// Open the frequency RocksDB database in read only mode.
tracing::info!("Opening frequency database");
let rocksdb_path = format!(
"{}/seqvars/{}/freqs",
"{}/{}/seqvars/freqs",
&args.path_db,
path_component(assembly)
);
Expand All @@ -1418,7 +1416,7 @@ fn run_with_writer(writer: &mut dyn AnnotatedVcfWriter, args: &Args) -> Result<(
// Open the ClinVar RocksDB database in read only mode.
tracing::info!("Opening ClinVar database");
let rocksdb_path = format!(
"{}/seqvars/{}/clinvar",
"{}/{}/seqvars/clinvar",
&args.path_db,
path_component(assembly)
);
Expand All @@ -1436,7 +1434,7 @@ fn run_with_writer(writer: &mut dyn AnnotatedVcfWriter, args: &Args) -> Result<(
// Open the serialized transcripts.
tracing::info!("Opening transcript database");
let tx_db = load_tx_db(&format!(
"{}/seqvars/{}/txs.bin",
"{}/{}/txs.bin.zst",
&args.path_db,
path_component(assembly)
))?;
Expand Down
3 changes: 1 addition & 2 deletions src/annotate/strucvars/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2591,7 +2591,7 @@ fn run_with_writer(
let file_date = args
.file_date
.as_ref()
.map(|v| v.clone())
.cloned()
.unwrap_or(Utc::now().date_naive().format("%Y%m%d").to_string());
let header_out = vcf_header::build(
args.genome_release
Expand Down Expand Up @@ -2787,7 +2787,6 @@ mod test {
use rstest::rstest;
use std::fs::File;

use chrono::NaiveDate;
use clap_verbosity_flag::Verbosity;
use hgvs::static_data::Assembly;
use linked_hash_map::LinkedHashMap;
Expand Down
4 changes: 2 additions & 2 deletions src/db/create/txs/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -481,7 +481,7 @@ fn build_protobuf(
file,
flate2::Compression::default(),
))
} else if path_out.ends_with(".zstd") {
} else if path_out.ends_with(".zst") {
Box::new(zstd::Encoder::new(file, 0).map_err(|e| {
anyhow!(
"failed to open zstd enoder for {}: {}",
Expand Down Expand Up @@ -856,7 +856,7 @@ pub mod test {
verbose: Verbosity::new(0, 1),
};
let args = Args {
path_out: tmp_dir.join("out.bin"),
path_out: tmp_dir.join("out.bin.zst"),
path_cdot_json: vec![PathBuf::from(
"tests/data/db/create/txs/cdot-0.2.12.refseq.grch37_grch38.brca1_opa1.json",
)],
Expand Down
2 changes: 1 addition & 1 deletion src/verify/seqvars.rs
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ pub fn run(_common: &crate::common::Args, args: &Args) -> Result<(), anyhow::Err
// Read the serialized transcripts.
tracing::info!("Opening transcript database");
let tx_db = load_tx_db(&format!(
"{}/seqvars/{}/txs.bin",
"{}/{}/txs.bin.zst",
&args.path_db,
path_component(assembly)
))?;
Expand Down
3 changes: 3 additions & 0 deletions tests/data/annotate/db/grch37/bootstrap.sh
Git LFS file not shown
3 changes: 3 additions & 0 deletions tests/data/annotate/db/grch37/seqvars/clinvar/LOG
Git LFS file not shown
3 changes: 3 additions & 0 deletions tests/data/annotate/db/grch37/txs.bin.zst
Git LFS file not shown
19 changes: 0 additions & 19 deletions tests/data/annotate/db/seqvars/grch37/bootstrap.sh

This file was deleted.

3 changes: 0 additions & 3 deletions tests/data/annotate/db/seqvars/grch37/clinvar/LOG

This file was deleted.

3 changes: 0 additions & 3 deletions tests/data/annotate/db/seqvars/grch37/txs.bin

This file was deleted.