Skip to content

Commit

Permalink
feat: hgnc_xlink.tsv is expected now in hpo folder (#170) (#171)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe authored Jul 11, 2024
1 parent b89fc26 commit 5c19d95
Show file tree
Hide file tree
Showing 57 changed files with 1,928 additions and 996 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ utoipa-swagger-ui = { version = "7.1.0", features = ["actix-web"] }
utoipa = { version = "4.2", features = ["actix_extras", "chrono", "indexmap", "preserve_order", "yaml"] }

[build-dependencies]
anyhow = "1.0"
prost-build = "0.12"

[dev-dependencies]
Expand Down
60 changes: 29 additions & 31 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,29 @@ We fix ourselves to the release from 2023-06-06.
```
# RELEASE=2023-06-06
# URL=https://github.com/obophenotype/human-phenotype-ontology/releases/download
# NAMES="hp.obo phenotype.hpoa phenotype_to_genes.txt genes_to_phenotype.txt"
# NAMES="hp-base.obo phenotype.hpoa phenotype_to_genes.txt genes_to_phenotype.txt"
# mkdir -p /tmp/data/hpo
# for name in $NAMES; do \
wget \
-O /tmp/data/hpo/$name \
$URL/v$RELEASE/$name;
done
# mv /tmp/data/hpo/hp-base.obo /tmp/data/hpo/hp.obo
# sed -i -e 's|/hp-base.owl||' /tmp/data/hpo/hp.obo
```

Next, generate the cross-link file between different gene identifiers.

```
# wget -O /tmp/hgnc_complete_set.json \
https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/json/hgnc_complete_set.json
# echo -e "hgnc_id\tensembl_gene_id\tentrez_id\tgene_symbol" \
> /tmp/data/hpo/hgnc_xlink.tsv
# jq -r '.response.docs[] | select(.entrez_id != null) | [.hgnc_id, .ensembl_gene_id, .entrez_id, .symbol] | @tsv' \
/tmp/hgnc_complete_set.json \
| LC_ALL=C sort -t $'\t' -k3,3n \
>> /tmp/data/hpo/hgnc_xlink.tsv
```

You can now convert the downloaded text HPO files to a binary format, which will improve the performance of loading data.
Expand All @@ -57,46 +72,29 @@ You can now conver the downloaded text HPO files to a binary format which will i
After having the precomputed data, you can startup the server as follows:

```
# viguno run-server \
# viguno server run \
--path-hpo-dir tests/data/hpo
INFO args_common = Args { verbose: Verbosity { verbose: 0, quiet: 0, phantom: PhantomData<clap_verbosity_flag::InfoLevel> } }
INFO args = Args { path_hpo_dir: "tests/data/hpo", suppress_hints: false, listen_host: "127.0.0.1", listen_port: 8080 }
INFO Loading HPO...
INFO ...done loading HPO in 8.180012599s
INFO Opening RocksDB for reading...
INFO ...done opening RocksDB in 19.027133ms
INFO attempting to load binary HPO file from tests/data/hpo
INFO ...done loading HPO in 4.788750172s
INFO Loading HGNC xlink...
INFO ... done loading HGNC xlink in 156.362034ms
INFO Loading HPO OBO...
INFO ... done loading HPO OBO in 1.90213703s
INFO Indexing OBO...
INFO ... done indexing OBO in 835.558794ms
INFO Launching server main on http://127.0.0.1:8080 ...
INFO try: http://127.0.0.1:8080/hpo/genes?gene_symbol=TGDS
INFO try: http://127.0.0.1:8080/hpo/genes?gene_id=23483&hpo_terms=true
INFO try: http://127.0.0.1:8080/hpo/omims?omim_id=616145&hpo_terms=true
INFO try: http://127.0.0.1:8080/hpo/terms?term_id=HP:0000023&genes=true
INFO try: http://127.0.0.1:8080/hpo/sim/term-term?lhs=HP:0001166,HP:0040069&rhs=HP:0005918,HP:0004188
INFO try: http://127.0.0.1:8080/hpo/sim/term-gene?terms=HP:0001166,HP:0000098&gene_symbols=FBN1,TGDS,TTN
INFO starting 4 workers
INFO SEE SWAGGER UI FOR INTERACTIVE DOCS: http://127.0.0.1:8080/swagger-ui/
INFO starting 8 workers
INFO Actix runtime found; starting in Actix runtime
```

Now the server is running; you can stop it with `Ctrl-C`.

In another terminal, you can now do as suggested above.
Note that we truncate the output JSON.

```
# curl 'http://127.0.0.1:8080/hpo/genes?gene_symbol=TGDS'
[{"gene_ncbi_id":23483,"gene_symbol":"TGDS"}]
# curl 'http://127.0.0.1:8080/hpo/genes?gene_id=23483&hpo_terms=true'
[{"gene_ncbi_id":23483,"gene_symbol":"TGDS","hpo_terms":[{"term_...
# curl 'http://127.0.0.1:8080/hpo/omims?omim_id=616145&hpo_terms=true'
[{"omim_id":"OMIM:616145","name":"Catel-Manzke syndrome","hpo_te...
# curl 'http://127.0.0.1:8080/hpo/terms?term_id=HP:0000023&genes=true'
[{"term_id":"HP:0000023","name":"Inguinal hernia","genes":[{"gen...
# curl 'http://127.0.0.1:8080/hpo/sim/term-term?lhs=HP:0001166,HP:0040069&rhs=HP:0005918,HP:0004188'
[{"lhs":"HP:0001166","rhs":"HP:0005918","score":1.4280319,"sim":...
```
You can go to http://127.0.0.1:8080/swagger-ui/ to see the automatically generated interactive API documentation.
You can find the OpenAPI YAML file for the `main` branch [here on GitHub](https://raw.githubusercontent.com/varfish-org/viguno/main/openapi.yaml) and e.g., open it [here in the public Swagger editor](https://editor.swagger.io?url=https://raw.githubusercontent.com/varfish-org/viguno/main/openapi.yaml).

# Developer Documentation

Expand Down
25 changes: 20 additions & 5 deletions build.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,28 @@
// The custom build script, needed as we use protocolbuffers.
use std::{env, path::PathBuf};

fn main() -> Result<(), anyhow::Error> {
let root = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("protos");
let proto_files = ["viguno/v1/simulation.proto"]
.iter()
.map(|f| root.join(f))
.collect::<Vec<_>>();

// Tell cargo to recompile if any of these proto files are changed
for proto_file in &proto_files {
println!("cargo:rerun-if-changed={}", proto_file.display());
}

let descriptor_path = PathBuf::from(env::var("OUT_DIR").unwrap()).join("proto_descriptor.bin");

fn main() {
prost_build::Config::new()
.protoc_arg("-Isrc/proto")
// Save descriptors to file
.file_descriptor_set_path(&descriptor_path)
// Add serde serialization and deserialization to the generated code.
.type_attribute(".", "#[derive(serde::Serialize, serde::Deserialize)]")
// Skip serializing `None` values.
.type_attribute(".", "#[serde_with::skip_serializing_none]")
// Define the protobuf files to compile.
.compile_protos(&["viguno/v1/simulation.proto"], &["src/"])
.unwrap();
.compile_protos(&proto_files, &[root])?;

Ok(())
}
101 changes: 98 additions & 3 deletions openapi.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ paths:
tags:
- hpo_genes
summary: Query for genes in the HPO database.
operationId: handle
operationId: hpo_genes
parameters:
- name: gene_id
in: query
Expand Down Expand Up @@ -63,7 +63,7 @@ paths:
tags:
- hpo_omims
summary: Query for OMIM diseases in the HPO database.
operationId: handle
operationId: hpo_omims
parameters:
- name: omim_id
in: query
Expand Down Expand Up @@ -107,6 +107,101 @@ paths:
application/json:
schema:
$ref: '#/components/schemas/Result'
/hpo/sim/term-gene:
get:
tags:
- hpo_sim::term_gene
summary: Query for similarity between a set of terms to each entry in a
description: list of genes.
operationId: hpo_sim_term_gene
parameters:
- name: terms
in: query
description: Set of terms to use as query.
required: true
schema:
type: array
items:
type: string
- name: gene_ids
in: query
description: The set of ids for genes to use as "database".
required: false
schema:
type: array
items:
type: string
nullable: true
- name: gene_symbols
in: query
description: The set of symbols for genes to use as "database".
required: false
schema:
type: array
items:
type: string
nullable: true
responses:
'200':
description: The query was successful.
content:
application/json:
schema:
$ref: '#/components/schemas/Result'
/hpo/sim/term-term:
get:
tags:
- hpo_sim::term_term
summary: Query for pairwise term similarity.
description: |-
In the case of Resnik, this corresponds to `IC(MICA(t_1, t_2))`.
# Errors
In the case that there is an error running the server.
operationId: hpo_sim_term_term
parameters:
- name: lhs
in: query
description: The one set of HPO terms to compute similarity for.
required: true
schema:
type: array
items:
type: string
- name: rhs
in: query
description: The second set of HPO terms to compute similarity for.
required: true
schema:
type: array
items:
type: string
- name: ic_base
in: query
description: What should information content be based on.
required: false
schema:
$ref: '#/components/schemas/IcBasedOn'
- name: similarity
in: query
description: The similarity method to use.
required: false
schema:
$ref: '#/components/schemas/SimilarityMethod'
- name: combiner
in: query
description: The score combiner.
required: false
schema:
$ref: '#/components/schemas/ScoreCombiner'
responses:
'200':
description: The query was successful.
content:
application/json:
schema:
$ref: '#/components/schemas/Result'
/hpo/terms:
get:
tags:
Expand All @@ -116,7 +211,7 @@ paths:
# Errors
In the case that there is an error running the server.
operationId: handle
operationId: hpo_terms
parameters:
- name: term_id
in: query
Expand Down
File renamed without changes.
2 changes: 1 addition & 1 deletion src/algos/phenomizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ mod test {

let score = score(&prepare(query), &hpo_marfan, &hpo);

assert!((score - 1.770_859_7).abs() < 0.00001, "score = {score}");
assert!((score - 1.757_194).abs() < 0.00001, "score = {score}");

Ok(())
}
Expand Down
6 changes: 2 additions & 4 deletions src/query/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,6 @@ pub struct Args {
/// Path to the directory with the HPO files.
#[arg(long, required = true)]
pub path_hpo_dir: String,
/// Path to the TSV file with the HGNC xlink data.
#[arg(long, required = true)]
pub path_hgnc_xlink: String,

/// Path to JSON file with the genes to rank.
#[arg(long)]
Expand Down Expand Up @@ -318,7 +315,8 @@ pub fn run(args_common: &crate::common::Args, args: &Args) -> Result<(), anyhow:

tracing::info!("Loading HGNC xlink...");
let before_load_xlink = Instant::now();
let ncbi_to_hgnc = crate::common::hgnc_xlink::load_ncbi_to_hgnc(&args.path_hgnc_xlink)?;
let path_hgnc_xlink = format!("{}/hgnc_xlink.tsv", args.path_hpo_dir);
let ncbi_to_hgnc = crate::common::hgnc_xlink::load_ncbi_to_hgnc(path_hgnc_xlink)?;
tracing::info!(
"... done loading HGNC xlink in {:?}",
before_load_xlink.elapsed()
Expand Down
3 changes: 2 additions & 1 deletion src/server/run/hpo_genes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ pub struct Result {
/// Query for genes in the HPO database.
#[allow(clippy::unused_async)]
#[utoipa::path(
operation_id = "hpo_genes",
params(Query),
responses(
(status = 200, description = "The query was successful.", body = Result),
Expand Down Expand Up @@ -215,7 +216,7 @@ pub(crate) mod test {
pub fn web_server_data() -> Arc<crate::server::run::WebServerData> {
let ontology = crate::common::load_hpo("tests/data/hpo").expect("could not load HPO");
let ncbi_to_hgnc =
crate::common::hgnc_xlink::load_ncbi_to_hgnc("tests/data/hgnc_xlink.tsv")
crate::common::hgnc_xlink::load_ncbi_to_hgnc("tests/data/hpo/hgnc_xlink.tsv")
.expect("could not HGNC xlink");
let hgnc_to_ncbi = crate::common::hgnc_xlink::inverse_hashmap(&ncbi_to_hgnc);
let hpo_doc = fastobo::from_file("tests/data/hpo/hp.obo").expect("could not load HPO OBO");
Expand Down
1 change: 1 addition & 0 deletions src/server/run/hpo_omims.rs
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ pub struct Result {
/// Query for OMIM diseases in the HPO database.
#[allow(clippy::unused_async)]
#[utoipa::path(
operation_id = "hpo_omims",
params(Query),
responses(
(status = 200, description = "The query was successful.", body = Result),
Expand Down
Loading

0 comments on commit 5c19d95

Please sign in to comment.