Skip to content

Commit

Permalink
feat: change database path setup (#48) (#60)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe authored Apr 24, 2023
1 parent 262e86d commit b26671f
Show file tree
Hide file tree
Showing 24 changed files with 26 additions and 45 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ cd ../mehari
cargo run --release -- \
-v \
db create txs \
--path-out /tmp/txs-out.bin \
--path-out /tmp/txs-out.bin.zst \
--path-cdot-json ../cdot-0.2.12.ensembl.grch37_grch38.json.gz \
--path-cdot-json ../cdot-0.2.12.refseq.grch37_grch38.json.gz \
--path-seqrepo-instance ../hgvs-rs-data/seqrepo-data/master/master
Expand Down
6 changes: 3 additions & 3 deletions docs/db_build.md
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ You can build the transcript database protocolbuffers binary using the following

```text
$ mehari db create txs \
--path-out output/b37/txs.bin \
--path-out output/db/grch37/txs.bin.zst \
\
--path-seqrepo-instance path/to/seqrepo-data/master \
\
Expand All @@ -158,15 +158,15 @@ $ mehari db create txs \
You will have to build the transcript database separately for each genome release that you want, passing the release explicitly via `--genome-release`.
For GRCh38, simply use `--genome-release grch38`.

You can enable compression by using the suffix `.gz` for gzip compression and `.zstd` for zstandard compression.
You can enable compression by using the suffix `.gz` for gzip compression and `.zst` for zstandard compression.

# Building ClinVar Database

This assumes that you have converted a recent ClinVar XML file to TSV using [clinvar-tsv](https://github.com/bihealth/clinvar-tsv).

```
$ mehari db create seqvar-clinvar \
--path-output-db ~/Data/mehari/db/seqvars/grch37/clinvar \
--path-output-db ~/Data/mehari/db/grch37/seqvars/clinvar \
--path-clinvar-tsv path/to/clinvar_seqvars.b37.tsv.gz
```

Expand Down
4 changes: 2 additions & 2 deletions src/annotate/seqvars/csq.rs
Original file line number Diff line number Diff line change
Expand Up @@ -702,7 +702,7 @@ mod test {

#[test]
fn annotate_snv_brca1_one_variant() -> Result<(), anyhow::Error> {
let tx_path = "tests/data/annotate/db/seqvars/grch37/txs.bin";
let tx_path = "tests/data/annotate/db/grch37/txs.bin.zst";
let tx_db = load_tx_db(tx_path)?;
let provider = Rc::new(MehariProvider::new(tx_db, Assembly::Grch37p10));

Expand Down Expand Up @@ -829,7 +829,7 @@ mod test {
}

fn annotate_vars(path_tsv: &str, txs: &[String]) -> Result<(), anyhow::Error> {
let tx_path = "tests/data/annotate/db/seqvars/grch37/txs.bin";
let tx_path = "tests/data/annotate/db/grch37/txs.bin.zst";
let tx_db = load_tx_db(tx_path)?;
let provider = Rc::new(MehariProvider::new(tx_db, Assembly::Grch37p10));
let predictor = ConsequencePredictor::new(provider, Assembly::Grch37p10);
Expand Down
16 changes: 7 additions & 9 deletions src/annotate/seqvars/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ pub struct PathOutput {
pub path_output_vcf: Option<String>,

/// Path to the output TSV file (for import into VarFish).
#[arg(long, requires = "path-input-ped")]
#[arg(long)]
pub path_output_tsv: Option<String>,
}

Expand Down Expand Up @@ -489,7 +489,7 @@ pub fn load_tx_db(tx_path: &str) -> Result<TxSeqDatabase, anyhow::Error> {
.map_err(|e| anyhow!("failed to open file {}: {}", tx_path, e))?;
let mut reader: Box<dyn Read> = if tx_path.ends_with(".gz") {
Box::new(flate2::read::MultiGzDecoder::new(file))
} else if tx_path.ends_with(".zstd") {
} else if tx_path.ends_with(".zst") {
Box::new(
zstd::Decoder::new(file)
.map_err(|e| anyhow!("failed to open zstd decoder for {}: {}", tx_path, e))?,
Expand All @@ -499,11 +499,9 @@ pub fn load_tx_db(tx_path: &str) -> Result<TxSeqDatabase, anyhow::Error> {
};

// Now read the whole file into a byte buffer.
let metadata = std::fs::metadata(tx_path)
.map_err(|e| anyhow!("failed to get metadata for {}: {}", tx_path, e))?;
let mut buffer = vec![0; metadata.len() as usize];
let mut buffer = Vec::new();
reader
.read(&mut buffer)
.read_to_end(&mut buffer)
.map_err(|e| anyhow!("failed to read file {}: {}", tx_path, e))?;

// Deserialize the buffer with prost.
Expand Down Expand Up @@ -1398,7 +1396,7 @@ fn run_with_writer(writer: &mut dyn AnnotatedVcfWriter, args: &Args) -> Result<(
// Open the frequency RocksDB database in read only mode.
tracing::info!("Opening frequency database");
let rocksdb_path = format!(
"{}/seqvars/{}/freqs",
"{}/{}/seqvars/freqs",
&args.path_db,
path_component(assembly)
);
Expand All @@ -1418,7 +1416,7 @@ fn run_with_writer(writer: &mut dyn AnnotatedVcfWriter, args: &Args) -> Result<(
// Open the ClinVar RocksDB database in read only mode.
tracing::info!("Opening ClinVar database");
let rocksdb_path = format!(
"{}/seqvars/{}/clinvar",
"{}/{}/seqvars/clinvar",
&args.path_db,
path_component(assembly)
);
Expand All @@ -1436,7 +1434,7 @@ fn run_with_writer(writer: &mut dyn AnnotatedVcfWriter, args: &Args) -> Result<(
// Open the serialized transcripts.
tracing::info!("Opening transcript database");
let tx_db = load_tx_db(&format!(
"{}/seqvars/{}/txs.bin",
"{}/{}/txs.bin.zst",
&args.path_db,
path_component(assembly)
))?;
Expand Down
3 changes: 1 addition & 2 deletions src/annotate/strucvars/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2591,7 +2591,7 @@ fn run_with_writer(
let file_date = args
.file_date
.as_ref()
.map(|v| v.clone())
.cloned()
.unwrap_or(Utc::now().date_naive().format("%Y%m%d").to_string());
let header_out = vcf_header::build(
args.genome_release
Expand Down Expand Up @@ -2787,7 +2787,6 @@ mod test {
use rstest::rstest;
use std::fs::File;

use chrono::NaiveDate;
use clap_verbosity_flag::Verbosity;
use hgvs::static_data::Assembly;
use linked_hash_map::LinkedHashMap;
Expand Down
4 changes: 2 additions & 2 deletions src/db/create/txs/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -481,7 +481,7 @@ fn build_protobuf(
file,
flate2::Compression::default(),
))
} else if path_out.ends_with(".zstd") {
} else if path_out.ends_with(".zst") {
Box::new(zstd::Encoder::new(file, 0).map_err(|e| {
anyhow!(
"failed to open zstd enoder for {}: {}",
Expand Down Expand Up @@ -856,7 +856,7 @@ pub mod test {
verbose: Verbosity::new(0, 1),
};
let args = Args {
path_out: tmp_dir.join("out.bin"),
path_out: tmp_dir.join("out.bin.zst"),
path_cdot_json: vec![PathBuf::from(
"tests/data/db/create/txs/cdot-0.2.12.refseq.grch37_grch38.brca1_opa1.json",
)],
Expand Down
2 changes: 1 addition & 1 deletion src/verify/seqvars.rs
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ pub fn run(_common: &crate::common::Args, args: &Args) -> Result<(), anyhow::Err
// Read the serialized transcripts.
tracing::info!("Opening transcript database");
let tx_db = load_tx_db(&format!(
"{}/seqvars/{}/txs.bin",
"{}/{}/txs.bin.zst",
&args.path_db,
path_component(assembly)
))?;
Expand Down
3 changes: 3 additions & 0 deletions tests/data/annotate/db/grch37/bootstrap.sh
Git LFS file not shown
3 changes: 3 additions & 0 deletions tests/data/annotate/db/grch37/seqvars/clinvar/LOG
Git LFS file not shown
File renamed without changes.
3 changes: 3 additions & 0 deletions tests/data/annotate/db/grch37/txs.bin.zst
Git LFS file not shown
19 changes: 0 additions & 19 deletions tests/data/annotate/db/seqvars/grch37/bootstrap.sh

This file was deleted.

3 changes: 0 additions & 3 deletions tests/data/annotate/db/seqvars/grch37/clinvar/LOG

This file was deleted.

3 changes: 0 additions & 3 deletions tests/data/annotate/db/seqvars/grch37/txs.bin

This file was deleted.

0 comments on commit b26671f

Please sign in to comment.