diff --git a/.cargo/config b/.cargo/config.toml similarity index 100% rename from .cargo/config rename to .cargo/config.toml diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..deb891c --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,57 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "type": "lldb", + "request": "launch", + "name": "Debug executable 'viguno'", + "cargo": { + "args": [ + "build", + "--bin=viguno", + "--package=viguno" + ], + "filter": { + "name": "viguno", + "kind": "bin" + } + }, + "args": [ + "simulate", + "--ic-base", "gene", + "--similarity", "resnik", + "--combiner", "fun-sim-avg", + "--path-hpo-dir", "/home/holtgrem_c/Development/varfish-db-downloader/work/download/hpo/20240111", + "--path-out-rocksdb", "/tmp/viguno-rocksdb", + "--min-terms", "1", + "--max-terms", "1", + "--num-simulations", "10", + "--only-gene", "ARID1B", + "--seed", "42" + ], + "cwd": "${workspaceFolder}" + }, + { + "type": "lldb", + "request": "launch", + "name": "Debug unit tests in executable 'viguno'", + "cargo": { + "args": [ + "test", + "--no-run", + "--bin=viguno", + "--package=viguno" + ], + "filter": { + "name": "viguno", + "kind": "bin" + } + }, + "args": [], + "cwd": "${workspaceFolder}" + } + ] +} diff --git a/Cargo.lock b/Cargo.lock index ceab5b4..ea4c723 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -359,27 +359,6 @@ version = "0.21.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "35636a1494ede3b646cc98f74f8e62c773a38a659ebc777a2cf26b9b74171df9" -[[package]] -name = "bindgen" -version = "0.65.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfdf7b466f9a4903edc73f95d6d2bcd5baf8ae620638762244d3f60143643cc5" -dependencies = [ - "bitflags 1.3.2", - "cexpr", - "clang-sys", - "lazy_static", - "lazycell", - "peeking_take_while", - "prettyplease", - "proc-macro2", - "quote", - "regex", - "rustc-hash", - "shlex", - "syn 2.0.48", -] - [[package]] name = "bitflags" version = "1.3.2" @@ -475,17 +454,6 @@ dependencies = [ "bytes", ] -[[package]] -name = "bzip2-sys" -version = "0.1.11+1.0.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" -dependencies = [ - "cc", - "libc", - "pkg-config", -] - [[package]] name = "cc" version = "1.0.83" @@ -502,15 +470,6 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fafee10a5dd1cffcb5cc560e0d0df8803d7355a2b12272e3557dee57314cb6e" -[[package]] -name = "cexpr" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" -dependencies = [ - "nom", -] - [[package]] name = "cfg-if" version = "1.0.0" @@ -527,17 +486,6 @@ dependencies = [ "serde", ] -[[package]] -name = "clang-sys" -version = "1.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c688fc74432808e3eb684cae8830a86be1d66a2bd58e1f248ed0960a590baf6f" -dependencies = [ - "glob", - "libc", - "libloading", -] - [[package]] name = "clap" version = "4.4.18" @@ -971,34 +919,99 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba" +[[package]] +name = "futures" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" +dependencies = [ + "futures-core", + "futures-sink", +] + [[package]] name = "futures-core" -version = "0.3.29" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb1d22c66e66d9d72e1758f0bd7d4fd0bee04cad842ee34587d68c07e45d088c" +checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" + +[[package]] +name = "futures-executor" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a576fc72ae164fca6b9db127eaa9a9dda0d61316034f33a0a0d4eda41f02b01d" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" + +[[package]] +name = "futures-macro" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.48", +] [[package]] name = "futures-sink" -version = "0.3.29" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e36d3378ee38c2a36ad710c5d30c2911d752cb941c00c72dbabfb786a7970817" +checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" [[package]] name = "futures-task" -version = "0.3.29" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" + +[[package]] +name = "futures-timer" +version = "3.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "efd193069b0ddadc69c46389b740bbccdd97203899b48d09c5f7969591d6bae2" +checksum = "f288b0a4f20f9a56b5d1da57e2227c661b7b16168e2f72365f57b63326e29b24" [[package]] name = "futures-util" -version = "0.3.29" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a19526d624e703a3179b3d322efec918b6246ea0fa51d41124525f00f1cc8104" +checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" dependencies = [ + "futures-channel", "futures-core", + "futures-io", + "futures-macro", + "futures-sink", "futures-task", + "memchr", "pin-project-lite", "pin-utils", + "slab", ] [[package]] @@ -1292,12 +1305,6 @@ version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" -[[package]] -name = "lazycell" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" - [[package]] name = "levenshtein_automata" version = "0.2.1" @@ -1310,16 +1317,6 @@ version = "0.2.151" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "302d7ab3130588088d277783b1e2d2e10c9e9e4a16dd9050e6ec93fb3e7048f4" -[[package]] -name = "libloading" -version = "0.7.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b67380fd3b2fbe7527a606e18729d21c6f3951633d0500574c4dc22d2d638b9f" -dependencies = [ - "cfg-if", - "winapi", -] - [[package]] name = "libm" version = "0.2.8" @@ -1337,33 +1334,6 @@ dependencies = [ "redox_syscall", ] -[[package]] -name = "librocksdb-sys" -version = "0.11.0+8.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3386f101bcb4bd252d8e9d2fb41ec3b0862a15a62b478c355b2982efa469e3e" -dependencies = [ - "bindgen", - "bzip2-sys", - "cc", - "glob", - "libc", - "libz-sys", - "lz4-sys", - "zstd-sys", -] - -[[package]] -name = "libz-sys" -version = "1.1.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d97137b25e321a73eef1418d1d5d2eda4d77e12813f8e6dead84bc52c5870a7b" -dependencies = [ - "cc", - "pkg-config", - "vcpkg", -] - [[package]] name = "linked-hash-map" version = "0.5.6" @@ -1432,16 +1402,6 @@ dependencies = [ "hashbrown", ] -[[package]] -name = "lz4-sys" -version = "1.9.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57d27b317e207b10f69f5e75494119e391a96f48861ae870d1da6edac98ca900" -dependencies = [ - "cc", - "libc", -] - [[package]] name = "lz4_flex" version = "0.11.1" @@ -1734,12 +1694,6 @@ version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" -[[package]] -name = "peeking_take_while" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" - [[package]] name = "percent-encoding" version = "2.3.1" @@ -1847,6 +1801,15 @@ dependencies = [ "syn 2.0.48", ] +[[package]] +name = "proc-macro-crate" +version = "3.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d37c51ca738a55da99dc0c4a34860fd675453b8b36209178c2249bb13651284" +dependencies = [ + "toml_edit", +] + [[package]] name = "proc-macro-error" version = "1.0.4" @@ -2110,6 +2073,12 @@ version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" +[[package]] +name = "relative-path" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba39f3699c378cd8970968dcbff9c43159ea4cfbd88d43c00b22f2ef10a435d2" + [[package]] name = "remove_dir_all" version = "0.5.3" @@ -2120,24 +2089,33 @@ dependencies = [ ] [[package]] -name = "rocksdb" +name = "rstest" version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb6f170a4041d50a0ce04b0d2e14916d6ca863ea2e422689a5b694395d299ffe" +checksum = "9afd55a67069d6e434a95161415f5beeada95a01c7b815508a82dcb0e1593682" dependencies = [ - "libc", - "librocksdb-sys", + "futures", + "futures-timer", + "rstest_macros", + "rustc_version", ] [[package]] -name = "rocksdb-utils-lookup" -version = "0.3.0" +name = "rstest_macros" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b83002edb508bf7fc5b234bde1489ccaea5bffbaeaf0aae00270257c858b5f9f" +checksum = "4165dfae59a39dd41d8dec720d3cbfbc71f69744efb480a3920f5d4e0cc6798d" dependencies = [ - "rocksdb", - "thiserror", - "tracing", + "cfg-if", + "glob", + "proc-macro-crate", + "proc-macro2", + "quote", + "regex", + "relative-path", + "rustc_version", + "syn 2.0.48", + "unicode-ident", ] [[package]] @@ -2343,12 +2321,6 @@ dependencies = [ "dirs", ] -[[package]] -name = "shlex" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7cee0529a6d40f580e7a5e6c495c8fbfe21b7b52795ed4bb5e62cdf92bc6380" - [[package]] name = "signal-hook-registry" version = "1.4.1" @@ -2780,6 +2752,23 @@ dependencies = [ "tracing", ] +[[package]] +name = "toml_datetime" +version = "0.6.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4badfd56924ae69bcc9039335b2e017639ce3f9b001c393c1b2d1ef846ce2cbf" + +[[package]] +name = "toml_edit" +version = "0.21.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a8534fd7f78b5405e860340ad6575217ce99f38d4d5c8f2442cb5ecb50090e1" +dependencies = [ + "indexmap", + "toml_datetime", + "winnow", +] + [[package]] name = "tracing" version = "0.1.40" @@ -2920,12 +2909,6 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" -[[package]] -name = "vcpkg" -version = "0.2.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" - [[package]] name = "version_check" version = "0.9.4" @@ -2960,8 +2943,7 @@ dependencies = [ "prost-build", "rayon", "regex", - "rocksdb", - "rocksdb-utils-lookup", + "rstest", "serde", "serde_json", "serde_test", @@ -3241,6 +3223,15 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" +[[package]] +name = "winnow" +version = "0.5.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f593a95398737aeed53e489c785df13f3618e41dbcd6718c6addbf1395aa6876" +dependencies = [ + "memchr", +] + [[package]] name = "yaml-rust" version = "0.4.5" diff --git a/Cargo.toml b/Cargo.toml index 91556d5..5a23ad7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,8 +30,6 @@ once_cell = "1.18" prost = "0.12" rayon = "1.8" regex = "1.10" -rocksdb = { version = "0.21", features = ["multi-threaded-cf"] } -rocksdb-utils-lookup = "0.3" serde = { version = "1.0", features = ["serde_derive"] } serde_json = "1.0" serde_with = { version = "3.6", features=["alloc", "macros", "indexmap_2"], default-features = false } @@ -50,6 +48,7 @@ prost-build = "0.12" [dev-dependencies] file_diff = "1.0" insta = { version = "1.34", features = ["yaml"] } +rstest = "0.21" serde_test = "1.0" temp_testdir = "0.2" diff --git a/README.md b/README.md index 8b40167..187d4f4 100644 --- a/README.md +++ b/README.md @@ -52,21 +52,6 @@ You can now conver the downloaded text HPO files to a binary format which will i --path-out-bin /tmp/data/hpo/hpo.bin ``` -To use the similarity computations, you will need to run some simulation as precomputation. -You should fix the seed for reproducibility. -The number of simulations should be high for production (the default is 100k) but you can reduce this for a local setup. - -``` -# viguno simulate \ - --num-simulations 10 \ - --seed 42 \ - --path-hpo-dir /tmp/data/hpo/hpo \ - --path-out-rocksdb /tmp/data/hpo/hpo/scores-fun-sim-avg-resnik-gene \ - --combiner fun-sim-avg \ - --similarity resnik \ - --ic-base gene -``` - ## Running the Server After having the precomputed data, you can startup the server as follows: diff --git a/src/algos/phenomizer.rs b/src/algos/phenomizer.rs index 969c27e..1a91761 100644 --- a/src/algos/phenomizer.rs +++ b/src/algos/phenomizer.rs @@ -40,7 +40,7 @@ fn score_dir(qs: &HpoGroup, ds: &HpoGroup, o: &Ontology, s: &impl Similarity) -> // NB: we allow loss of precision in this function for the following statement. let len: u16 = qs.len().try_into().expect("more than 2^16 query terms"); - let len: f32 = len.try_into().expect("too many query terms for f32"); + let len: f32 = len.into(); tmp.iter().sum::() / len } diff --git a/src/main.rs b/src/main.rs index 0d09ebb..da01c15 100644 --- a/src/main.rs +++ b/src/main.rs @@ -11,7 +11,6 @@ pub mod index; pub mod pbs; pub mod query; pub mod server; -pub mod simulate; use clap::{Parser, Subcommand}; @@ -42,7 +41,6 @@ enum Commands { Convert(crate::convert::Args), Query(crate::query::Args), RunServer(crate::server::Args), - Simulate(crate::simulate::Args), } fn main() -> Result<(), anyhow::Error> { @@ -76,9 +74,6 @@ fn main() -> Result<(), anyhow::Error> { Commands::RunServer(args) => { server::run(&cli.common, args)?; } - Commands::Simulate(args) => { - simulate::run(&cli.common, args)?; - } } Ok::<(), anyhow::Error>(()) diff --git a/src/query/mod.rs b/src/query/mod.rs index 6cfd4f7..8c66915 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -1,8 +1,6 @@ //! Code for ranking genes on the command line. use hpo::similarity::Builtins; -use prost::Message; -use rocksdb::{DBWithThreadMode, MultiThreaded}; use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::time::Instant; @@ -11,7 +9,6 @@ use clap::Parser; use hpo::{annotations::AnnotationId, term::HpoGroup, HpoTermId, Ontology}; use crate::algos::phenomizer; -use crate::pbs::simulation::SimulationResults; use crate::query::query_result::TermDetails; /// Command line arguments for `query` command. @@ -93,10 +90,8 @@ pub mod query_result { pub struct Record { /// The gene symbol. pub gene_symbol: String, - /// The estimate for empirical P-value - pub p_value: f32, - /// The score (`-10 * log10(p_value)`). - pub score: f32, + /// The raw Phenomizer score. + pub raw_score: f32, /// Details on individual terms. #[serde(default = "Option::default")] pub terms: Option>, @@ -142,17 +137,11 @@ pub fn run_query( patient: &HpoGroup, genes: &Vec<&hpo::annotations::Gene>, hpo: &Ontology, - db: &DBWithThreadMode, ncbi_to_hgnc: &HashMap, ) -> Result where S: std::hash::BuildHasher, { - let cf_resnik = db - .cf_handle("scores") - .expect("database is missing 'scores' column family"); - - let num_terms = std::cmp::min(10, patient.len()); let query = query_result::Query { terms: patient .iter() @@ -173,13 +162,8 @@ where }; for gene in genes { let ncbi_gene_id = gene.id().as_u32(); - let key = format!("{ncbi_gene_id}:{num_terms}"); - let data = db - .get_cf(&cf_resnik, key.as_bytes())? - .expect("key not found"); - let res = SimulationResults::decode(&data[..])?; tracing::debug!("gene = {:?}", gene); - let score = phenomizer::score( + let raw_score = phenomizer::score( patient, &gene .to_hpo_set(hpo) @@ -190,14 +174,6 @@ where hpo, ); - let lower_bound = res.scores[..].partition_point(|x| *x < score); - let upper_bound = res.scores[..].partition_point(|x| *x <= score); - let idx = (lower_bound + upper_bound) / 2; - let idx = std::cmp::min(idx, res.scores.len() - 1); - // NB: we accept loss of precision when converting to f64 below. - let p = 1.0 - (idx as f64) / (res.scores.len() as f64); - let log_p = -10.0 * p.log10(); - // For each term in the gene, provide query term with the highest similarity. let mut terms = gene .to_hpo_set(hpo) @@ -250,10 +226,7 @@ where result.result.push(query_result::Record { gene_symbol: gene.name().to_string(), - // NB: we accept value truncation here ... - p_value: p as f32, - // NB: ... and here. - score: log_p as f32, + raw_score, terms: Some(terms), }); } @@ -261,10 +234,10 @@ where // Sort genes for reproducibility. result.query.genes.sort(); - // Sort output records by score for reproducibility. + // Sort output records by raw score for reproducibility. result .result - .sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap()); + .sort_by(|a, b| b.raw_score.partial_cmp(&a.raw_score).unwrap()); Ok(result) } @@ -297,17 +270,6 @@ pub fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), anyhow: let hpo = crate::common::load_hpo(&args.path_hpo_dir)?; tracing::info!("...done loading HPO in {:?}", before_loading.elapsed()); - tracing::info!("Opening RocksDB for reading..."); - let before_rocksdb = Instant::now(); - let path_rocksdb = format!("{}/scores-fun-sim-avg-resnik-gene", args.path_hpo_dir); - let db = rocksdb::DB::open_cf_for_read_only( - &rocksdb::Options::default(), - &path_rocksdb, - ["meta", "scores"], - true, - )?; - tracing::info!("...done opening RocksDB in {:?}", before_rocksdb.elapsed()); - tracing::info!("Loading genes..."); let before_load_genes = Instant::now(); let genes_json = std::fs::read_to_string(&args.path_genes_json)?; @@ -358,7 +320,7 @@ pub fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), anyhow: tracing::info!("Starting priorization..."); let before_priorization = Instant::now(); - let result = run_query(&query, &genes, &hpo, &db, &ncbi_to_hgnc)?; + let result = run_query(&query, &genes, &hpo, &ncbi_to_hgnc)?; tracing::info!( "... done with prioritization in {:?}", before_priorization.elapsed() @@ -373,14 +335,13 @@ pub fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), anyhow: "P-value", "score" ); - tracing::info!(" | | |"); + tracing::info!(" | |"); for (i, gene) in result.result.iter().enumerate() { tracing::info!( - "{: >4} | {: <10} | {: >10.5} | {: >10.2}", + "{: >4} | {: <10} | {: >10.5}", i + 1, gene.gene_symbol, - gene.p_value, - gene.score + gene.raw_score ); } diff --git a/src/server/actix_server/hpo_genes.rs b/src/server/actix_server/hpo_genes.rs index 6403641..0e3afdc 100644 --- a/src/server/actix_server/hpo_genes.rs +++ b/src/server/actix_server/hpo_genes.rs @@ -197,7 +197,6 @@ mod test { actix_web::App::new() .app_data(actix_web::web::Data::new(crate::server::WebServerData { ontology, - db: None, ncbi_to_hgnc, hgnc_to_ncbi, full_text_index: crate::index::Index::new(hpo_doc)?, diff --git a/src/server/actix_server/hpo_omims.rs b/src/server/actix_server/hpo_omims.rs index e03188f..266e3e7 100644 --- a/src/server/actix_server/hpo_omims.rs +++ b/src/server/actix_server/hpo_omims.rs @@ -224,7 +224,6 @@ mod test { actix_web::App::new() .app_data(actix_web::web::Data::new(crate::server::WebServerData { ontology, - db: None, ncbi_to_hgnc, hgnc_to_ncbi, full_text_index: crate::index::Index::new(hpo_doc)?, diff --git a/src/server/actix_server/hpo_sim/snapshots/viguno__server__actix_server__hpo_sim__term_gene__test__hpo_sim_term_gene_terms_hgnc_gene_ids.snap b/src/server/actix_server/hpo_sim/snapshots/viguno__server__actix_server__hpo_sim__term_gene__test__hpo_sim_term_gene_terms_hgnc_gene_ids.snap index 9f5fbe9..3f0915d 100644 --- a/src/server/actix_server/hpo_sim/snapshots/viguno__server__actix_server__hpo_sim__term_gene__test__hpo_sim_term_gene_terms_hgnc_gene_ids.snap +++ b/src/server/actix_server/hpo_sim/snapshots/viguno__server__actix_server__hpo_sim__term_gene__test__hpo_sim_term_gene_terms_hgnc_gene_ids.snap @@ -20,8 +20,7 @@ query: hgnc_id: "HGNC:20324" result: - gene_symbol: TGDS - p_value: 0.1 - score: 10 + raw_score: 1.028543 terms: - term_query: term_id: "HP:0000347" @@ -417,8 +416,7 @@ result: term_name: Postnatal growth retardation score: 0 - gene_symbol: TTN - p_value: 0.3 - score: 5.2287874 + raw_score: 0.91585016 terms: - term_query: term_id: "HP:0000347" @@ -1070,4 +1068,3 @@ result: term_id: "HP:0030059" term_name: Mitochondrial depletion score: 0 - diff --git a/src/server/actix_server/hpo_sim/snapshots/viguno__server__actix_server__hpo_sim__term_gene__test__hpo_sim_term_gene_terms_ncbi_gene_ids.snap b/src/server/actix_server/hpo_sim/snapshots/viguno__server__actix_server__hpo_sim__term_gene__test__hpo_sim_term_gene_terms_ncbi_gene_ids.snap index 72de43d..70c16b8 100644 --- a/src/server/actix_server/hpo_sim/snapshots/viguno__server__actix_server__hpo_sim__term_gene__test__hpo_sim_term_gene_terms_ncbi_gene_ids.snap +++ b/src/server/actix_server/hpo_sim/snapshots/viguno__server__actix_server__hpo_sim__term_gene__test__hpo_sim_term_gene_terms_ncbi_gene_ids.snap @@ -20,8 +20,7 @@ query: hgnc_id: "HGNC:20324" result: - gene_symbol: TGDS - p_value: 0.1 - score: 10 + raw_score: 1.028543 terms: - term_query: term_id: "HP:0000347" @@ -417,8 +416,7 @@ result: term_name: Postnatal growth retardation score: 0 - gene_symbol: TTN - p_value: 0.3 - score: 5.2287874 + raw_score: 0.91585016 terms: - term_query: term_id: "HP:0000347" @@ -1070,4 +1068,3 @@ result: term_id: "HP:0030059" term_name: Mitochondrial depletion score: 0 - diff --git a/src/server/actix_server/hpo_sim/snapshots/viguno__server__actix_server__hpo_sim__term_gene__test__hpo_sim_term_gene_terms_symbols.snap b/src/server/actix_server/hpo_sim/snapshots/viguno__server__actix_server__hpo_sim__term_gene__test__hpo_sim_term_gene_terms_symbols.snap index 08b1dba..817282d 100644 --- a/src/server/actix_server/hpo_sim/snapshots/viguno__server__actix_server__hpo_sim__term_gene__test__hpo_sim_term_gene_terms_symbols.snap +++ b/src/server/actix_server/hpo_sim/snapshots/viguno__server__actix_server__hpo_sim__term_gene__test__hpo_sim_term_gene_terms_symbols.snap @@ -20,8 +20,7 @@ query: hgnc_id: "HGNC:20324" result: - gene_symbol: TGDS - p_value: 0.1 - score: 10 + raw_score: 1.028543 terms: - term_query: term_id: "HP:0000347" @@ -417,8 +416,7 @@ result: term_name: Postnatal growth retardation score: 0 - gene_symbol: TTN - p_value: 0.3 - score: 5.2287874 + raw_score: 0.91585016 terms: - term_query: term_id: "HP:0000347" @@ -1070,4 +1068,3 @@ result: term_id: "HP:0030059" term_name: Mitochondrial depletion score: 0 - diff --git a/src/server/actix_server/hpo_sim/term_gene.rs b/src/server/actix_server/hpo_sim/term_gene.rs index 3f3644e..57fd8a9 100644 --- a/src/server/actix_server/hpo_sim/term_gene.rs +++ b/src/server/actix_server/hpo_sim/term_gene.rs @@ -90,14 +90,8 @@ async fn handle( }?; // Perform similarity computation. - let result = query::run_query( - &query_terms, - &genes, - hpo, - data.db.as_ref().expect("must provide RocksDB"), - &data.ncbi_to_hgnc, - ) - .map_err(CustomError::new)?; + let result = query::run_query(&query_terms, &genes, hpo, &data.ncbi_to_hgnc) + .map_err(CustomError::new)?; Ok(Json(result)) } @@ -112,19 +106,12 @@ mod test { let ncbi_to_hgnc = crate::common::hgnc_xlink::load_ncbi_to_hgnc("tests/data/hgnc_xlink.tsv")?; let hgnc_to_ncbi = crate::common::hgnc_xlink::inverse_hashmap(&ncbi_to_hgnc); - let db = Some(rocksdb::DB::open_cf_for_read_only( - &rocksdb::Options::default(), - format!("{}/{}", hpo_path, "scores-fun-sim-avg-resnik-gene"), - ["meta", "scores"], - true, - )?); let hpo_doc = fastobo::from_file("tests/data/hpo/hp.obo")?; let app = actix_web::test::init_service( actix_web::App::new() .app_data(actix_web::web::Data::new(crate::server::WebServerData { ontology, - db, ncbi_to_hgnc, hgnc_to_ncbi, full_text_index: crate::index::Index::new(hpo_doc)?, diff --git a/src/server/actix_server/hpo_terms.rs b/src/server/actix_server/hpo_terms.rs index 94c3828..bd52d27 100644 --- a/src/server/actix_server/hpo_terms.rs +++ b/src/server/actix_server/hpo_terms.rs @@ -139,7 +139,7 @@ impl ResultEntry { .get_all(field_def) .filter_map(|f| f.as_text().map(std::string::ToString::to_string)) .collect::>(); - let definition = definition.first().map(std::clone::Clone::clone); + let definition = definition.first().cloned(); let synonyms = doc .get_all(field_synonym) .filter_map(|f| f.as_text().map(std::string::ToString::to_string)) @@ -330,23 +330,18 @@ async fn handle( #[cfg(test)] mod test { + use super::super::test::ws_data_default; + use std::sync::Arc; + /// Helper function for running a query. #[allow(dead_code)] - async fn run_query(uri: &str) -> Result { - let ontology = crate::common::load_hpo("tests/data/hpo")?; - let ncbi_to_hgnc = - crate::common::hgnc_xlink::load_ncbi_to_hgnc("tests/data/hgnc_xlink.tsv")?; - let hgnc_to_ncbi = crate::common::hgnc_xlink::inverse_hashmap(&ncbi_to_hgnc); - let hpo_doc = fastobo::from_file("tests/data/hpo/hp.obo")?; + async fn run_query( + ws_data: Arc, + uri: &str, + ) -> Result { let app = actix_web::test::init_service( actix_web::App::new() - .app_data(actix_web::web::Data::new(crate::server::WebServerData { - ontology, - db: None, - ncbi_to_hgnc, - hgnc_to_ncbi, - full_text_index: crate::index::Index::new(hpo_doc)?, - })) + .app_data(actix_web::web::Data::new(ws_data)) .service(super::handle), ) .await; diff --git a/src/server/mod.rs b/src/server/mod.rs index b84e7bc..828d17f 100644 --- a/src/server/mod.rs +++ b/src/server/mod.rs @@ -13,8 +13,6 @@ use crate::common::load_hpo; pub struct WebServerData { /// The HPO ontology (`hpo` crate). pub ontology: Ontology, - /// The database with precomputed Resnik P-values. - pub db: Option>, /// Xlink map from NCBI gene ID to HGNC gene ID. pub ncbi_to_hgnc: HashMap, /// Xlink map from HGNC gene ID to NCBI gene ID. @@ -126,18 +124,6 @@ pub fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), anyhow: let before_loading = std::time::Instant::now(); let ontology = load_hpo(&args.path_hpo_dir)?; tracing::info!("...done loading HPO in {:?}", before_loading.elapsed()); - tracing::info!("Opening RocksDB for reading..."); - let before_rocksdb = std::time::Instant::now(); - let db = rocksdb::DB::open_cf_for_read_only( - &rocksdb::Options::default(), - format!( - "{}/{}", - &args.path_hpo_dir, "scores-fun-sim-avg-resnik-gene" - ), - ["meta", "scores"], - true, - )?; - tracing::info!("...done opening RocksDB in {:?}", before_rocksdb.elapsed()); tracing::info!("Loading HGNC xlink..."); let before_load_xlink = std::time::Instant::now(); @@ -165,7 +151,6 @@ pub fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), anyhow: let data = actix_web::web::Data::new(WebServerData { ontology, - db: Some(db), ncbi_to_hgnc, hgnc_to_ncbi, full_text_index, diff --git a/src/simulate/mod.rs b/src/simulate/mod.rs deleted file mode 100644 index 56bbf09..0000000 --- a/src/simulate/mod.rs +++ /dev/null @@ -1,305 +0,0 @@ -//! Code for running the precomputation. - -use indicatif::ParallelProgressIterator; -use prost::Message; -use rayon::prelude::*; -use rocksdb::{DBWithThreadMode, MultiThreaded}; -use std::io::Write; -use std::time::Instant; - -use clap::Parser; -use hpo::{ - annotations::AnnotationId, - similarity::{Builtins, GroupSimilarity, StandardCombiner}, - term::{HpoGroup, InformationContentKind}, - HpoSet, HpoTermId, Ontology, -}; - -use crate::{ - common::{IcBasedOn, ScoreCombiner, SimilarityMethod}, - pbs::simulation::SimulationResults, -}; - -/// Command line arguments for Viguno. -#[derive(Parser, Debug)] -#[command(author, version, about = "Prepare values for Viguno", long_about = None)] -pub struct Args { - /// Path to the directory with the HPO files. - #[arg(long, required = true)] - pub path_hpo_dir: String, - /// Path to output RocksDB. - #[arg(long, required = true)] - pub path_out_rocksdb: String, - - /// Number of simulations to perform for each gene and term set size. - #[arg(long, default_value_t = 100_000, value_parser = clap::value_parser!(u64).range(2..))] - pub num_simulations: u64, - /// Run simulations for `min_terms..=max_terms` terms. - #[arg(long, default_value_t = 1)] - pub min_terms: usize, - /// Run simulations for `min_terms..=max_terms` terms. - #[arg(long, default_value_t = 10)] - pub max_terms: usize, - - /// What should information content be based on. - #[arg(long)] - pub ic_base: IcBasedOn, - /// The similarity method to use. - #[arg(long)] - pub similarity: SimilarityMethod, - /// The score combiner. - #[arg(long)] - pub combiner: ScoreCombiner, - - /// Optional gene ID or symbol to limit to. - #[arg(long)] - pub only_gene: Option, - /// Optional path to folder with per-gene logs. - #[arg(long)] - pub path_gene_logs: Option, - - /// Number of threads to use for simulation (default is 1 thread per core). - #[arg(long)] - pub num_threads: Option, - /// Seed for the random number generator. - #[arg(long)] - pub seed: Option, -} - -/// Run simulation using ontology and number of terms. -fn run_simulation( - db: &DBWithThreadMode, - ontology: &Ontology, - args: &Args, - num_terms: usize, -) -> Result<(), anyhow::Error> { - tracing::info!(" running simulation for {} terms ...", num_terms); - let before = Instant::now(); - - // We want at least two simulations. - let num_simulations = std::cmp::max(args.num_simulations, 2); - - // Get all HPO terms for phenotypic abnormalities. - let hpo_abnormality = ontology - .hpo(HpoTermId::from(String::from("HP:0000118"))) - .ok_or(anyhow::anyhow!( - "could not find HP:0000118 (phenotypic abnormality)" - ))?; - let term_ids = ontology - .hpos() - .filter(|t| t.child_of(&hpo_abnormality)) - .map(|t| t.id()) - .collect::>(); - - // Get all genes into a vector so we can use parallel iteration. - let genes = { - let mut genes = ontology.genes().collect::>(); - if let Some(only_gene) = args.only_gene.as_ref() { - genes.retain(|g| { - g.id().to_string().as_str().eq(only_gene.as_str()) - || g.symbol().eq(only_gene.as_str()) - }); - } - genes - }; - - // The pairwise term simliarity score to use. - let pairwise_sim = Builtins::Resnik(InformationContentKind::Gene); - // The combiner for multiple pairwise scores. - let combiner: StandardCombiner = args.combiner.into(); - // The groupwise similarity to use. - let group_sim = GroupSimilarity::new(combiner, pairwise_sim); - - // Run simulations for each gene in parallel. - genes - .par_iter() - .progress_with(crate::common::progress_bar(genes.len())) - .for_each(|gene| { - let mut log_file = if let Some(path_gene_logs) = args.path_gene_logs.as_ref() { - let path = std::path::Path::new(path_gene_logs).join(num_terms.to_string()); - std::fs::create_dir_all(&path).expect("cannot create logs directory"); - Some( - std::fs::File::create(format!("{}/{}.txt", path.display(), gene.symbol())) - .expect("could not open file"), - ) - } else { - None - }; - - // Obtain `HpoSet` from gene. - let gene_terms = HpoSet::new( - ontology, - gene.to_hpo_set(ontology) - .child_nodes() - .without_modifier() - .into_iter() - .collect::(), - ); - - // Obtain sorted list of similarity scores from simulations. - let mut scores = (0..num_simulations) - .map(|_| { - // Pick `num_terms` random terms with circuit breakers on number of tries. - let max_tries = 1000; - let sampled_terms = { - let mut tries = 0; - let mut hpo_group = HpoGroup::new(); - while hpo_group.len() < num_terms { - tries += 1; - assert!(tries <= max_tries, "tried too often to pick random terms"); - let term_id = term_ids[fastrand::usize(0..term_ids.len())]; - if !hpo_group.contains(&term_id) { - hpo_group.insert(term_id); - } - } - HpoSet::new(ontology, hpo_group) - }; - - // Compute the similarity from the sampled terms to the terms from the gene. - let res_score = group_sim.calculate(&sampled_terms, &gene_terms); - - if let Some(log_file) = log_file.as_mut() { - writeln!( - log_file, - "{}\t{}\t{}", - res_score, - gene.symbol(), - sampled_terms - .iter() - .map(|t| format!("{} ({})", t.id(), t.name())) - .collect::>() - .join(", ") - ) - .expect("could not write"); - } - - res_score - }) - .collect::>(); - - // Sort the scores ascendingly. - scores.sort_by(|a, b| a.partial_cmp(b).expect("NaN value")); - - // Copy the scores into the score distribution. - let ncbi_gene_id = gene.id().as_u32(); - let sim_res = SimulationResults { - ncbi_gene_id, - gene_symbol: gene.name().to_string(), - term_count: num_terms.try_into().expect("too many terms"), - scores, - }; - - // Encode as byte array. - let sim_res = sim_res.encode_to_vec(); - - // Write to RocksDB. - let cf_resnik = db.cf_handle("scores").unwrap(); - let key = format!("{ncbi_gene_id}:{num_terms}"); - db.put_cf(&cf_resnik, key.as_bytes(), sim_res) - .expect("writing to RocksDB failed"); - }); - tracing::info!(" ... done in {:?}", before.elapsed()); - - Ok(()) -} - -/// Main entry point for `prepare` command. -/// -/// # Errors -/// -/// In the case that there is an error in running the preparation command. -pub fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), anyhow::Error> { - tracing::info!("args_common = {:?}", &args_common); - tracing::info!("args = {:?}", &args); - - if let Some(level) = args_common.verbose.log_level() { - match level { - log::Level::Trace | log::Level::Debug => { - std::env::set_var("RUST_LOG", "debug"); - env_logger::init_from_env(env_logger::Env::new().default_filter_or("info")); - } - _ => (), - } - } - - tracing::info!("Loading HPO..."); - let before_loading = Instant::now(); - let ontology = crate::common::load_hpo(&args.path_hpo_dir)?; - tracing::info!("...done loading HPO in {:?}", before_loading.elapsed()); - - tracing::info!("Opening RocksDB for writing..."); - let before_rocksdb = Instant::now(); - let options = rocksdb_utils_lookup::tune_options(rocksdb::Options::default(), None); - let cf_names = &["meta", "scores"]; - let db = rocksdb::DB::open_cf_with_opts( - &options, - &args.path_out_rocksdb, - cf_names - .iter() - .map(|name| ((*name).to_string(), options.clone())) - .collect::>(), - )?; - // write out metadata - let cf_meta = db - .cf_handle("meta") - .ok_or(anyhow::anyhow!("column family meta not found"))?; - db.put_cf(&cf_meta, "hpo-version", ontology.hpo_version())?; - db.put_cf(&cf_meta, "app-version", crate::common::version())?; - tracing::info!("...done opening RocksDB in {:?}", before_rocksdb.elapsed()); - - tracing::info!("Running simulations..."); - let before_simulations = Instant::now(); - if let Some(seed) = args.seed { - fastrand::seed(seed); - } - if let Some(num_threds) = args.num_threads { - rayon::ThreadPoolBuilder::new() - .num_threads(num_threds) - .build_global()?; - } - for num_terms in args.min_terms..=args.max_terms { - run_simulation(&db, &ontology, args, num_terms)?; - } - tracing::info!( - "... done with simulations in {:?}", - before_simulations.elapsed() - ); - - tracing::info!("Enforcing manual compaction"); - rocksdb_utils_lookup::force_compaction_cf(&db, cf_names, Some(" "), true)?; - tracing::info!("All done. Have a nice day!"); - Ok(()) -} - -#[cfg(test)] -mod test { - use clap_verbosity_flag::Verbosity; - use temp_testdir::TempDir; - - use crate::common::{IcBasedOn, ScoreCombiner, SimilarityMethod}; - - #[test] - fn smoke_test_run() -> Result<(), anyhow::Error> { - let tmp_dir = TempDir::default(); - - let args_common = crate::common::Args { - verbose: Verbosity::new(0, 0), - }; - let args = super::Args { - path_hpo_dir: String::from("tests/data/hpo"), - path_out_rocksdb: format!("{}", tmp_dir.display()), - num_simulations: 2, - min_terms: 1, - max_terms: 10, - only_gene: Some(String::from("TGDS")), - path_gene_logs: None, - num_threads: None, - seed: Some(42), - ic_base: IcBasedOn::default(), - similarity: SimilarityMethod::default(), - combiner: ScoreCombiner::default(), - }; - - super::run(&args_common, &args) - } -} diff --git a/tests/data/hpo/bootstraph.sh b/tests/data/hpo/bootstraph.sh index 9005c35..0ef0327 100644 --- a/tests/data/hpo/bootstraph.sh +++ b/tests/data/hpo/bootstraph.sh @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:30815a44b2dcb165710fd3394e617f609ba1a06728e80b503970e190a2a5c17d -size 1063 +oid sha256:d2615ded1da39a3ca105fe4d7222271fc343a6190c0068e7abeb25c5d922072b +size 257 diff --git a/tests/data/hpo/scores-fun-sim-avg-resnik-gene/000014.sst b/tests/data/hpo/scores-fun-sim-avg-resnik-gene/000014.sst deleted file mode 100644 index 956bf75..0000000 --- a/tests/data/hpo/scores-fun-sim-avg-resnik-gene/000014.sst +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0e68b1fdc3d57f5362b0e1c0d895389cae7c874b54a9878acfb29e5d60d160ef -size 1199 diff --git a/tests/data/hpo/scores-fun-sim-avg-resnik-gene/000016.sst b/tests/data/hpo/scores-fun-sim-avg-resnik-gene/000016.sst deleted file mode 100644 index 06661ab..0000000 --- a/tests/data/hpo/scores-fun-sim-avg-resnik-gene/000016.sst +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:449b96228aa250066da9d4c58c06012e73162745172408bca34003cf99eb263b -size 2046029 diff --git a/tests/data/hpo/scores-fun-sim-avg-resnik-gene/CURRENT b/tests/data/hpo/scores-fun-sim-avg-resnik-gene/CURRENT deleted file mode 100644 index f8d5048..0000000 --- a/tests/data/hpo/scores-fun-sim-avg-resnik-gene/CURRENT +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9c283f6e81028b9eb0760d918ee4bc0aa256ed3b926393c1734c760c4bd724fd -size 16 diff --git a/tests/data/hpo/scores-fun-sim-avg-resnik-gene/IDENTITY b/tests/data/hpo/scores-fun-sim-avg-resnik-gene/IDENTITY deleted file mode 100644 index 07c7e05..0000000 --- a/tests/data/hpo/scores-fun-sim-avg-resnik-gene/IDENTITY +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:94624b0cb1126cd9f447c5fb1757286b3b6cf51a46d564957e902231644297ec -size 36 diff --git a/tests/data/hpo/scores-fun-sim-avg-resnik-gene/LOCK b/tests/data/hpo/scores-fun-sim-avg-resnik-gene/LOCK deleted file mode 100644 index e69de29..0000000 diff --git a/tests/data/hpo/scores-fun-sim-avg-resnik-gene/LOG b/tests/data/hpo/scores-fun-sim-avg-resnik-gene/LOG deleted file mode 100644 index cfac647..0000000 --- a/tests/data/hpo/scores-fun-sim-avg-resnik-gene/LOG +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f5bc0ad433799f7c24e19a30370d394cc04dacb51f6a11db3e86a7dbb04ed942 -size 61448 diff --git a/tests/data/hpo/scores-fun-sim-avg-resnik-gene/MANIFEST-000005 b/tests/data/hpo/scores-fun-sim-avg-resnik-gene/MANIFEST-000005 deleted file mode 100644 index 8669945..0000000 --- a/tests/data/hpo/scores-fun-sim-avg-resnik-gene/MANIFEST-000005 +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c501fe994a42036b98f3c6518beda6149d67dae84a924c5551c53dd3edab8df9 -size 641 diff --git a/tests/data/hpo/scores-fun-sim-avg-resnik-gene/OPTIONS-000009 b/tests/data/hpo/scores-fun-sim-avg-resnik-gene/OPTIONS-000009 deleted file mode 100644 index 18875c5..0000000 --- a/tests/data/hpo/scores-fun-sim-avg-resnik-gene/OPTIONS-000009 +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ea7c071907182339eb7f2620c6a5d07533b84c8c6937566ceb0326df884108e2 -size 15367 diff --git a/tests/data/hpo/scores-fun-sim-avg-resnik-gene/OPTIONS-000011 b/tests/data/hpo/scores-fun-sim-avg-resnik-gene/OPTIONS-000011 deleted file mode 100644 index 18875c5..0000000 --- a/tests/data/hpo/scores-fun-sim-avg-resnik-gene/OPTIONS-000011 +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ea7c071907182339eb7f2620c6a5d07533b84c8c6937566ceb0326df884108e2 -size 15367