feat: change database path setup (#48) (#60)

varfish-org · Apr 24, 2023 · b26671f · b26671f
1 parent 262e86d
commit b26671f
Show file tree

Hide file tree

Showing 24 changed files with 26 additions and 45 deletions.
diff --git a/README.md b/README.md
@@ -165,7 +165,7 @@ cd ../mehari
 cargo run --release -- \
     -v \
     db create txs \
-        --path-out /tmp/txs-out.bin \
+        --path-out /tmp/txs-out.bin.zst \
         --path-cdot-json ../cdot-0.2.12.ensembl.grch37_grch38.json.gz \
         --path-cdot-json ../cdot-0.2.12.refseq.grch37_grch38.json.gz \
         --path-seqrepo-instance ../hgvs-rs-data/seqrepo-data/master/master

diff --git a/docs/db_build.md b/docs/db_build.md
@@ -143,7 +143,7 @@ You can build the transcript database protocolbuffers binary using the following
 
 ```text
 $ mehari db create txs \
-    --path-out output/b37/txs.bin \
+    --path-out output/db/grch37/txs.bin.zst \
     \
     --path-seqrepo-instance path/to/seqrepo-data/master \
     \
@@ -158,15 +158,15 @@ $ mehari db create txs \
 You will have to build the transcript database for each genome release that you want and manually specify the release to `--genome-release`.
 For GRCh38, simply use `--genome-release grch38`.
 
-You can enable compression by using the suffix `.gz` for gzip compression and `.zstd` for zstandard compression.
+You can enable compression by using the suffix `.gz` for gzip compression and `.zst` for zstandard compression.
 
 # Building ClinVar Database
 
 This assumes that you have converted a recent ClinVar XML file to TSV using [clinvar-tsv](https://github.com/bihealth/clinvar-tsv).
 
 ```
 $ mehari db create seqvar-clinvar \
-    --path-output-db ~/Data/mehari/db/seqvars/grch37/clinvar \
+    --path-output-db ~/Data/mehari/db/grch37/seqvars/clinvar \
     --path-clinvar-tsv path/to/clinvar_seqvars.b37.tsv.gz
 ```
 

diff --git a/src/annotate/seqvars/csq.rs b/src/annotate/seqvars/csq.rs
@@ -702,7 +702,7 @@ mod test {
 
     #[test]
     fn annotate_snv_brca1_one_variant() -> Result<(), anyhow::Error> {
-        let tx_path = "tests/data/annotate/db/seqvars/grch37/txs.bin";
+        let tx_path = "tests/data/annotate/db/grch37/txs.bin.zst";
         let tx_db = load_tx_db(tx_path)?;
         let provider = Rc::new(MehariProvider::new(tx_db, Assembly::Grch37p10));
 
@@ -829,7 +829,7 @@ mod test {
     }
 
     fn annotate_vars(path_tsv: &str, txs: &[String]) -> Result<(), anyhow::Error> {
-        let tx_path = "tests/data/annotate/db/seqvars/grch37/txs.bin";
+        let tx_path = "tests/data/annotate/db/grch37/txs.bin.zst";
         let tx_db = load_tx_db(tx_path)?;
         let provider = Rc::new(MehariProvider::new(tx_db, Assembly::Grch37p10));
         let predictor = ConsequencePredictor::new(provider, Assembly::Grch37p10);

diff --git a/src/annotate/seqvars/mod.rs b/src/annotate/seqvars/mod.rs
@@ -105,7 +105,7 @@ pub struct PathOutput {
     pub path_output_vcf: Option<String>,
 
     /// Path to the output TSV file (for import into VarFish).
-    #[arg(long, requires = "path-input-ped")]
+    #[arg(long)]
     pub path_output_tsv: Option<String>,
 }
 
@@ -489,7 +489,7 @@ pub fn load_tx_db(tx_path: &str) -> Result<TxSeqDatabase, anyhow::Error> {
         .map_err(|e| anyhow!("failed to open file {}: {}", tx_path, e))?;
     let mut reader: Box<dyn Read> = if tx_path.ends_with(".gz") {
         Box::new(flate2::read::MultiGzDecoder::new(file))
-    } else if tx_path.ends_with(".zstd") {
+    } else if tx_path.ends_with(".zst") {
         Box::new(
             zstd::Decoder::new(file)
                 .map_err(|e| anyhow!("failed to open zstd decoder for {}: {}", tx_path, e))?,
@@ -499,11 +499,9 @@ pub fn load_tx_db(tx_path: &str) -> Result<TxSeqDatabase, anyhow::Error> {
     };
 
     // Now read the whole file into a byte buffer.
-    let metadata = std::fs::metadata(tx_path)
-        .map_err(|e| anyhow!("failed to get metadata for {}: {}", tx_path, e))?;
-    let mut buffer = vec![0; metadata.len() as usize];
+    let mut buffer = Vec::new();
     reader
-        .read(&mut buffer)
+        .read_to_end(&mut buffer)
         .map_err(|e| anyhow!("failed to read file {}: {}", tx_path, e))?;
 
     // Deserialize the buffer with prost.
@@ -1398,7 +1396,7 @@ fn run_with_writer(writer: &mut dyn AnnotatedVcfWriter, args: &Args) -> Result<(
     // Open the frequency RocksDB database in read only mode.
     tracing::info!("Opening frequency database");
     let rocksdb_path = format!(
-        "{}/seqvars/{}/freqs",
+        "{}/{}/seqvars/freqs",
         &args.path_db,
         path_component(assembly)
     );
@@ -1418,7 +1416,7 @@ fn run_with_writer(writer: &mut dyn AnnotatedVcfWriter, args: &Args) -> Result<(
     // Open the ClinVar RocksDB database in read only mode.
     tracing::info!("Opening ClinVar database");
     let rocksdb_path = format!(
-        "{}/seqvars/{}/clinvar",
+        "{}/{}/seqvars/clinvar",
         &args.path_db,
         path_component(assembly)
     );
@@ -1436,7 +1434,7 @@ fn run_with_writer(writer: &mut dyn AnnotatedVcfWriter, args: &Args) -> Result<(
     // Open the serialized transcripts.
     tracing::info!("Opening transcript database");
     let tx_db = load_tx_db(&format!(
-        "{}/seqvars/{}/txs.bin",
+        "{}/{}/txs.bin.zst",
         &args.path_db,
         path_component(assembly)
     ))?;

diff --git a/src/annotate/strucvars/mod.rs b/src/annotate/strucvars/mod.rs
@@ -2591,7 +2591,7 @@ fn run_with_writer(
     let file_date = args
         .file_date
         .as_ref()
-        .map(|v| v.clone())
+        .cloned()
         .unwrap_or(Utc::now().date_naive().format("%Y%m%d").to_string());
     let header_out = vcf_header::build(
         args.genome_release
@@ -2787,7 +2787,6 @@ mod test {
     use rstest::rstest;
     use std::fs::File;
 
-    use chrono::NaiveDate;
     use clap_verbosity_flag::Verbosity;
     use hgvs::static_data::Assembly;
     use linked_hash_map::LinkedHashMap;

diff --git a/src/db/create/txs/mod.rs b/src/db/create/txs/mod.rs
@@ -481,7 +481,7 @@ fn build_protobuf(
             file,
             flate2::Compression::default(),
         ))
-    } else if path_out.ends_with(".zstd") {
+    } else if path_out.ends_with(".zst") {
         Box::new(zstd::Encoder::new(file, 0).map_err(|e| {
             anyhow!(
                 "failed to open zstd enoder for {}: {}",
@@ -856,7 +856,7 @@ pub mod test {
             verbose: Verbosity::new(0, 1),
         };
         let args = Args {
-            path_out: tmp_dir.join("out.bin"),
+            path_out: tmp_dir.join("out.bin.zst"),
             path_cdot_json: vec![PathBuf::from(
                 "tests/data/db/create/txs/cdot-0.2.12.refseq.grch37_grch38.brca1_opa1.json",
             )],

diff --git a/src/verify/seqvars.rs b/src/verify/seqvars.rs
@@ -120,7 +120,7 @@ pub fn run(_common: &crate::common::Args, args: &Args) -> Result<(), anyhow::Err
     // Read the serialized transcripts.
     tracing::info!("Opening transcript database");
     let tx_db = load_tx_db(&format!(
-        "{}/seqvars/{}/txs.bin",
+        "{}/{}/txs.bin.zst",
         &args.path_db,
         path_component(assembly)
     ))?;

diff --git a/tests/data/annotate/db/grch37/bootstrap.sh b/tests/data/annotate/db/grch37/bootstrap.sh
diff --git a/...tate/db/seqvars/grch37/clinvar/000012.log → ...tate/db/grch37/seqvars/clinvar/000012.log b/...tate/db/seqvars/grch37/clinvar/000012.log → ...tate/db/grch37/seqvars/clinvar/000012.log
diff --git a/...tate/db/seqvars/grch37/clinvar/000013.sst → ...tate/db/grch37/seqvars/clinvar/000013.sst b/...tate/db/seqvars/grch37/clinvar/000013.sst → ...tate/db/grch37/seqvars/clinvar/000013.sst
diff --git a/...tate/db/seqvars/grch37/clinvar/000014.sst → ...tate/db/grch37/seqvars/clinvar/000014.sst b/...tate/db/seqvars/grch37/clinvar/000014.sst → ...tate/db/grch37/seqvars/clinvar/000014.sst
diff --git a/...nnotate/db/seqvars/grch37/clinvar/CURRENT → ...nnotate/db/grch37/seqvars/clinvar/CURRENT b/...nnotate/db/seqvars/grch37/clinvar/CURRENT → ...nnotate/db/grch37/seqvars/clinvar/CURRENT
diff --git a/...notate/db/seqvars/grch37/clinvar/IDENTITY → ...notate/db/grch37/seqvars/clinvar/IDENTITY b/...notate/db/seqvars/grch37/clinvar/IDENTITY → ...notate/db/grch37/seqvars/clinvar/IDENTITY
diff --git a/...a/annotate/db/seqvars/grch37/clinvar/LOCK → ...a/annotate/db/grch37/seqvars/clinvar/LOCK b/...a/annotate/db/seqvars/grch37/clinvar/LOCK → ...a/annotate/db/grch37/seqvars/clinvar/LOCK
diff --git a/tests/data/annotate/db/grch37/seqvars/clinvar/LOG b/tests/data/annotate/db/grch37/seqvars/clinvar/LOG
diff --git a/...db/seqvars/grch37/clinvar/MANIFEST-000005 → ...db/grch37/seqvars/clinvar/MANIFEST-000005 b/...db/seqvars/grch37/clinvar/MANIFEST-000005 → ...db/grch37/seqvars/clinvar/MANIFEST-000005
diff --git a/.../db/seqvars/grch37/clinvar/OPTIONS-000009 → .../db/grch37/seqvars/clinvar/OPTIONS-000009 b/.../db/seqvars/grch37/clinvar/OPTIONS-000009 → .../db/grch37/seqvars/clinvar/OPTIONS-000009
diff --git a/.../db/seqvars/grch37/clinvar/OPTIONS-000011 → .../db/grch37/seqvars/clinvar/OPTIONS-000011 b/.../db/seqvars/grch37/clinvar/OPTIONS-000011 → .../db/grch37/seqvars/clinvar/OPTIONS-000011
diff --git a/tests/data/annotate/db/seqvars/grch37/freqs → tests/data/annotate/db/grch37/seqvars/freqs b/tests/data/annotate/db/seqvars/grch37/freqs → tests/data/annotate/db/grch37/seqvars/freqs
diff --git a/tests/data/annotate/db/grch37/txs.bin.zst b/tests/data/annotate/db/grch37/txs.bin.zst
diff --git a/...annotate/db/seqvars/grch37/txs.bin.report → ...ata/annotate/db/grch37/txs.bin.zst.report b/...annotate/db/seqvars/grch37/txs.bin.report → ...ata/annotate/db/grch37/txs.bin.zst.report
diff --git a/tests/data/annotate/db/seqvars/grch37/bootstrap.sh b/tests/data/annotate/db/seqvars/grch37/bootstrap.sh
diff --git a/tests/data/annotate/db/seqvars/grch37/clinvar/LOG b/tests/data/annotate/db/seqvars/grch37/clinvar/LOG
diff --git a/tests/data/annotate/db/seqvars/grch37/txs.bin b/tests/data/annotate/db/seqvars/grch37/txs.bin