diff --git a/README.md b/README.md index 1790bf18..e4ba7a40 100644 --- a/README.md +++ b/README.md @@ -165,7 +165,7 @@ cd ../mehari cargo run --release -- \ -v \ db create txs \ - --path-out /tmp/txs-out.bin \ + --path-out /tmp/txs-out.bin.zst \ --path-cdot-json ../cdot-0.2.12.ensembl.grch37_grch38.json.gz \ --path-cdot-json ../cdot-0.2.12.refseq.grch37_grch38.json.gz \ --path-seqrepo-instance ../hgvs-rs-data/seqrepo-data/master/master diff --git a/docs/db_build.md b/docs/db_build.md index 91675dc6..d56d3345 100644 --- a/docs/db_build.md +++ b/docs/db_build.md @@ -143,7 +143,7 @@ You can build the transcript database protocolbuffers binary using the following ```text $ mehari db create txs \ - --path-out output/b37/txs.bin \ + --path-out output/db/grch37/txs.bin.zst \ \ --path-seqrepo-instance path/to/seqrepo-data/master \ \ @@ -158,7 +158,7 @@ $ mehari db create txs \ You will have to build the transcript database for each genome release that you want and manually specify the release to `--genome-release`. For GRCh38, simply use `--genome-release grch38`. -You can enable compression by using the suffix `.gz` for gzip compression and `.zstd` for zstandard compression. +You can enable compression by using the suffix `.gz` for gzip compression and `.zst` for zstandard compression. 
# Building ClinVar Database @@ -166,7 +166,7 @@ This assumes that you have converted a recent ClinVar XML file to TSV using [cli ``` $ mehari db create seqvar-clinvar \ - --path-output-db ~/Data/mehari/db/seqvars/grch37/clinvar \ + --path-output-db ~/Data/mehari/db/grch37/seqvars/clinvar \ --path-clinvar-tsv path/to/clinvar_seqvars.b37.tsv.gz ``` diff --git a/src/annotate/seqvars/csq.rs b/src/annotate/seqvars/csq.rs index cebb2cc2..b6956336 100644 --- a/src/annotate/seqvars/csq.rs +++ b/src/annotate/seqvars/csq.rs @@ -702,7 +702,7 @@ mod test { #[test] fn annotate_snv_brca1_one_variant() -> Result<(), anyhow::Error> { - let tx_path = "tests/data/annotate/db/seqvars/grch37/txs.bin"; + let tx_path = "tests/data/annotate/db/grch37/txs.bin.zst"; let tx_db = load_tx_db(tx_path)?; let provider = Rc::new(MehariProvider::new(tx_db, Assembly::Grch37p10)); @@ -829,7 +829,7 @@ mod test { } fn annotate_vars(path_tsv: &str, txs: &[String]) -> Result<(), anyhow::Error> { - let tx_path = "tests/data/annotate/db/seqvars/grch37/txs.bin"; + let tx_path = "tests/data/annotate/db/grch37/txs.bin.zst"; let tx_db = load_tx_db(tx_path)?; let provider = Rc::new(MehariProvider::new(tx_db, Assembly::Grch37p10)); let predictor = ConsequencePredictor::new(provider, Assembly::Grch37p10); diff --git a/src/annotate/seqvars/mod.rs b/src/annotate/seqvars/mod.rs index d12eb00f..84b190b4 100644 --- a/src/annotate/seqvars/mod.rs +++ b/src/annotate/seqvars/mod.rs @@ -105,7 +105,7 @@ pub struct PathOutput { pub path_output_vcf: Option, /// Path to the output TSV file (for import into VarFish). 
- #[arg(long, requires = "path-input-ped")] + #[arg(long)] pub path_output_tsv: Option, } @@ -489,7 +489,7 @@ pub fn load_tx_db(tx_path: &str) -> Result { .map_err(|e| anyhow!("failed to open file {}: {}", tx_path, e))?; let mut reader: Box = if tx_path.ends_with(".gz") { Box::new(flate2::read::MultiGzDecoder::new(file)) - } else if tx_path.ends_with(".zstd") { + } else if tx_path.ends_with(".zst") { Box::new( zstd::Decoder::new(file) .map_err(|e| anyhow!("failed to open zstd decoder for {}: {}", tx_path, e))?, @@ -499,11 +499,9 @@ pub fn load_tx_db(tx_path: &str) -> Result { }; // Now read the whole file into a byte buffer. - let metadata = std::fs::metadata(tx_path) - .map_err(|e| anyhow!("failed to get metadata for {}: {}", tx_path, e))?; - let mut buffer = vec![0; metadata.len() as usize]; + let mut buffer = Vec::new(); reader - .read(&mut buffer) + .read_to_end(&mut buffer) .map_err(|e| anyhow!("failed to read file {}: {}", tx_path, e))?; // Deserialize the buffer with prost. @@ -1398,7 +1396,7 @@ fn run_with_writer(writer: &mut dyn AnnotatedVcfWriter, args: &Args) -> Result<( // Open the frequency RocksDB database in read only mode. tracing::info!("Opening frequency database"); let rocksdb_path = format!( - "{}/seqvars/{}/freqs", + "{}/{}/seqvars/freqs", &args.path_db, path_component(assembly) ); @@ -1418,7 +1416,7 @@ fn run_with_writer(writer: &mut dyn AnnotatedVcfWriter, args: &Args) -> Result<( // Open the ClinVar RocksDB database in read only mode. tracing::info!("Opening ClinVar database"); let rocksdb_path = format!( - "{}/seqvars/{}/clinvar", + "{}/{}/seqvars/clinvar", &args.path_db, path_component(assembly) ); @@ -1436,7 +1434,7 @@ fn run_with_writer(writer: &mut dyn AnnotatedVcfWriter, args: &Args) -> Result<( // Open the serialized transcripts. 
tracing::info!("Opening transcript database"); let tx_db = load_tx_db(&format!( - "{}/seqvars/{}/txs.bin", + "{}/{}/txs.bin.zst", &args.path_db, path_component(assembly) ))?; diff --git a/src/annotate/strucvars/mod.rs b/src/annotate/strucvars/mod.rs index 80419709..5dd434c4 100644 --- a/src/annotate/strucvars/mod.rs +++ b/src/annotate/strucvars/mod.rs @@ -2591,7 +2591,7 @@ fn run_with_writer( let file_date = args .file_date .as_ref() - .map(|v| v.clone()) + .cloned() .unwrap_or(Utc::now().date_naive().format("%Y%m%d").to_string()); let header_out = vcf_header::build( args.genome_release @@ -2787,7 +2787,6 @@ mod test { use rstest::rstest; use std::fs::File; - use chrono::NaiveDate; use clap_verbosity_flag::Verbosity; use hgvs::static_data::Assembly; use linked_hash_map::LinkedHashMap; diff --git a/src/db/create/txs/mod.rs b/src/db/create/txs/mod.rs index 7530a0c8..b27ef889 100644 --- a/src/db/create/txs/mod.rs +++ b/src/db/create/txs/mod.rs @@ -481,7 +481,7 @@ fn build_protobuf( file, flate2::Compression::default(), )) - } else if path_out.ends_with(".zstd") { + } else if path_out.ends_with(".zst") { Box::new(zstd::Encoder::new(file, 0).map_err(|e| { anyhow!( "failed to open zstd enoder for {}: {}", @@ -856,7 +856,7 @@ pub mod test { verbose: Verbosity::new(0, 1), }; let args = Args { - path_out: tmp_dir.join("out.bin"), + path_out: tmp_dir.join("out.bin.zst"), path_cdot_json: vec![PathBuf::from( "tests/data/db/create/txs/cdot-0.2.12.refseq.grch37_grch38.brca1_opa1.json", )], diff --git a/src/verify/seqvars.rs b/src/verify/seqvars.rs index a65f1507..da461401 100644 --- a/src/verify/seqvars.rs +++ b/src/verify/seqvars.rs @@ -120,7 +120,7 @@ pub fn run(_common: &crate::common::Args, args: &Args) -> Result<(), anyhow::Err // Read the serialized transcripts. 
tracing::info!("Opening transcript database"); let tx_db = load_tx_db(&format!( - "{}/seqvars/{}/txs.bin", + "{}/{}/txs.bin.zst", &args.path_db, path_component(assembly) ))?; diff --git a/tests/data/annotate/db/grch37/bootstrap.sh b/tests/data/annotate/db/grch37/bootstrap.sh new file mode 100644 index 00000000..b2457a5f --- /dev/null +++ b/tests/data/annotate/db/grch37/bootstrap.sh @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1865e5575504e4f004ba02d666515d79a8eccfd7f61cfa66d6b486bc249aa33e +size 442 diff --git a/tests/data/annotate/db/seqvars/grch37/clinvar/000012.log b/tests/data/annotate/db/grch37/seqvars/clinvar/000012.log similarity index 100% rename from tests/data/annotate/db/seqvars/grch37/clinvar/000012.log rename to tests/data/annotate/db/grch37/seqvars/clinvar/000012.log diff --git a/tests/data/annotate/db/seqvars/grch37/clinvar/000013.sst b/tests/data/annotate/db/grch37/seqvars/clinvar/000013.sst similarity index 100% rename from tests/data/annotate/db/seqvars/grch37/clinvar/000013.sst rename to tests/data/annotate/db/grch37/seqvars/clinvar/000013.sst diff --git a/tests/data/annotate/db/seqvars/grch37/clinvar/000014.sst b/tests/data/annotate/db/grch37/seqvars/clinvar/000014.sst similarity index 100% rename from tests/data/annotate/db/seqvars/grch37/clinvar/000014.sst rename to tests/data/annotate/db/grch37/seqvars/clinvar/000014.sst diff --git a/tests/data/annotate/db/seqvars/grch37/clinvar/CURRENT b/tests/data/annotate/db/grch37/seqvars/clinvar/CURRENT similarity index 100% rename from tests/data/annotate/db/seqvars/grch37/clinvar/CURRENT rename to tests/data/annotate/db/grch37/seqvars/clinvar/CURRENT diff --git a/tests/data/annotate/db/seqvars/grch37/clinvar/IDENTITY b/tests/data/annotate/db/grch37/seqvars/clinvar/IDENTITY similarity index 100% rename from tests/data/annotate/db/seqvars/grch37/clinvar/IDENTITY rename to tests/data/annotate/db/grch37/seqvars/clinvar/IDENTITY diff --git 
a/tests/data/annotate/db/seqvars/grch37/clinvar/LOCK b/tests/data/annotate/db/grch37/seqvars/clinvar/LOCK similarity index 100% rename from tests/data/annotate/db/seqvars/grch37/clinvar/LOCK rename to tests/data/annotate/db/grch37/seqvars/clinvar/LOCK diff --git a/tests/data/annotate/db/grch37/seqvars/clinvar/LOG b/tests/data/annotate/db/grch37/seqvars/clinvar/LOG new file mode 100644 index 00000000..e6ebbda2 --- /dev/null +++ b/tests/data/annotate/db/grch37/seqvars/clinvar/LOG @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8059e237679e071f583e988e16f284346f61c0c4931d132beed8e378e512622b +size 55676 diff --git a/tests/data/annotate/db/seqvars/grch37/clinvar/MANIFEST-000005 b/tests/data/annotate/db/grch37/seqvars/clinvar/MANIFEST-000005 similarity index 100% rename from tests/data/annotate/db/seqvars/grch37/clinvar/MANIFEST-000005 rename to tests/data/annotate/db/grch37/seqvars/clinvar/MANIFEST-000005 diff --git a/tests/data/annotate/db/seqvars/grch37/clinvar/OPTIONS-000009 b/tests/data/annotate/db/grch37/seqvars/clinvar/OPTIONS-000009 similarity index 100% rename from tests/data/annotate/db/seqvars/grch37/clinvar/OPTIONS-000009 rename to tests/data/annotate/db/grch37/seqvars/clinvar/OPTIONS-000009 diff --git a/tests/data/annotate/db/seqvars/grch37/clinvar/OPTIONS-000011 b/tests/data/annotate/db/grch37/seqvars/clinvar/OPTIONS-000011 similarity index 100% rename from tests/data/annotate/db/seqvars/grch37/clinvar/OPTIONS-000011 rename to tests/data/annotate/db/grch37/seqvars/clinvar/OPTIONS-000011 diff --git a/tests/data/annotate/db/seqvars/grch37/freqs b/tests/data/annotate/db/grch37/seqvars/freqs similarity index 100% rename from tests/data/annotate/db/seqvars/grch37/freqs rename to tests/data/annotate/db/grch37/seqvars/freqs diff --git a/tests/data/annotate/db/grch37/txs.bin.zst b/tests/data/annotate/db/grch37/txs.bin.zst new file mode 100644 index 00000000..3888b219 --- /dev/null +++ b/tests/data/annotate/db/grch37/txs.bin.zst @@ -0,0 +1,3 
@@ +version https://git-lfs.github.com/spec/v1 +oid sha256:850ada15b1fdfcb89cad96093583b34786d684ced7ba8d9cd977a8f05fab7879 +size 8556 diff --git a/tests/data/annotate/db/seqvars/grch37/txs.bin.report b/tests/data/annotate/db/grch37/txs.bin.zst.report similarity index 100% rename from tests/data/annotate/db/seqvars/grch37/txs.bin.report rename to tests/data/annotate/db/grch37/txs.bin.zst.report diff --git a/tests/data/annotate/db/seqvars/grch37/bootstrap.sh b/tests/data/annotate/db/seqvars/grch37/bootstrap.sh deleted file mode 100644 index 47ac944f..00000000 --- a/tests/data/annotate/db/seqvars/grch37/bootstrap.sh +++ /dev/null @@ -1,19 +0,0 @@ -# Extract transcripts for BRCA1 from the GRCh37 reference genome - -cargo \ - run \ - --release \ - -- \ - db \ - create \ - txs \ - --path-out \ - tests/data/annotate/db/seqvars/grch37/txs.bin \ - --path-cdot-json \ - ../cdot-0.2.12.refseq.grch37_grch38.json \ - --path-seqrepo-instance \ - ../hgvs-rs-data/seqrepo-data/master/master \ - --genome-release \ - grch37 \ - --gene-symbols BRCA1 \ - --gene-symbols OPA1 diff --git a/tests/data/annotate/db/seqvars/grch37/clinvar/LOG b/tests/data/annotate/db/seqvars/grch37/clinvar/LOG deleted file mode 100644 index 5e673761..00000000 --- a/tests/data/annotate/db/seqvars/grch37/clinvar/LOG +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9fe51512c988c1c1e729c9c92b259d6b604ee0c767286c29c3e6408201bbcb2e -size 55685 diff --git a/tests/data/annotate/db/seqvars/grch37/txs.bin b/tests/data/annotate/db/seqvars/grch37/txs.bin deleted file mode 100644 index 6f69b8b1..00000000 --- a/tests/data/annotate/db/seqvars/grch37/txs.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fe1da10491ab4027ce9a389021b06d7977447ade7f2754d3d073c3981acbd532 -size 99309