From adc54f3d03d0a6042a80f1b8584e0882376bff61 Mon Sep 17 00:00:00 2001 From: Manuel Holtgrewe Date: Wed, 9 Oct 2024 14:19:10 +0200 Subject: [PATCH] feat: annotate inheritance via HPO --- src/seqvars/query/annonars.rs | 22 +- src/seqvars/query/hpo.rs | 315 ++++++++++++++++++ src/seqvars/query/mod.rs | 11 +- ...hpo__test__hgnc_to_xlink_load_entries.snap | 24 ++ ...o__test__load_hgnc_to_inheritance_map.snap | 15 + ...test__phenotype_to_genes_load_entries.snap | 24 ++ src/strucvars/aggregate/cli.rs | 2 +- tests/strucvars/query/db/hpo/hgnc_xlink.tsv | 3 + .../query/db/hpo/phenotype_to_genes.txt | 3 + 9 files changed, 414 insertions(+), 5 deletions(-) create mode 100644 src/seqvars/query/hpo.rs create mode 100644 src/seqvars/query/snapshots/varfish_server_worker__seqvars__query__hpo__test__hgnc_to_xlink_load_entries.snap create mode 100644 src/seqvars/query/snapshots/varfish_server_worker__seqvars__query__hpo__test__load_hgnc_to_inheritance_map.snap create mode 100644 src/seqvars/query/snapshots/varfish_server_worker__seqvars__query__hpo__test__phenotype_to_genes_load_entries.snap create mode 100644 tests/strucvars/query/db/hpo/hgnc_xlink.tsv create mode 100644 tests/strucvars/query/db/hpo/phenotype_to_genes.txt diff --git a/src/seqvars/query/annonars.rs b/src/seqvars/query/annonars.rs index bf38f283..6c7a7279 100644 --- a/src/seqvars/query/annonars.rs +++ b/src/seqvars/query/annonars.rs @@ -6,11 +6,14 @@ use crate::{common::GenomeRelease, seqvars::ingest::path_component}; use prost::Message as _; -use super::schema::data::VariantRecord; +use super::{ + hpo::{load_hgnc_to_inheritance_map, HgncToMoiMap}, + schema::data::VariantRecord, +}; /// Bundle the types needed for databases. pub struct AnnonarsDbs { - /// annonaars gene RocksDB. + /// annonars gene RocksDB. pub genes_db: Arc>, /// ClinVar database as annonars RocksDB. pub clinvar_db: Arc>, @@ -124,6 +127,8 @@ impl AnnonarsDbs { pub struct Annotator { /// Annonars database bundles. pub annonars_dbs: AnnonarsDbs, + /// Mapping from HGNC gene ID to modes of inheritance; from `hpo` directory. + pub hgnc_to_moi: HgncToMoiMap, } impl Annotator { @@ -143,7 +148,18 @@ impl Annotator { e ) })?; - Ok(Self { annonars_dbs }) + let hgnc_to_moi = + load_hgnc_to_inheritance_map(&path.as_ref().join("hpo")).map_err(|e| { + anyhow::anyhow!( + "problem loading HGNC to mode of inheritance map at {}: {}", + path.as_ref().join("hpo").display(), + e + ) + })?; + Ok(Self { + annonars_dbs, + hgnc_to_moi, + }) } /// Query `genes` database for a given HGNC ID. diff --git a/src/seqvars/query/hpo.rs b/src/seqvars/query/hpo.rs new file mode 100644 index 00000000..6f8a0b4c --- /dev/null +++ b/src/seqvars/query/hpo.rs @@ -0,0 +1,315 @@ +//! Code for accessing HPO-related information. + +use crate::pbs::varfish::v1::seqvars::output as pbs_output; + +/// Enumeration for modes of inheritance. +#[derive( + serde::Serialize, + serde::Deserialize, + enum_map::Enum, + PartialEq, + Eq, + PartialOrd, + Ord, + Hash, + Clone, + Copy, + Debug, + strum::EnumString, + strum::Display, +)] +pub enum ModeOfInheritance { + /// Autosomal dominant inheritance (HP:0000006). + AutosomalDominant, + /// Autosomal recessive inheritance (HP:0000007). + AutosomalRecessive, + /// X-linked dominant inheritance (HP:0001419). + XLinkedDominant, + /// X-linked recessive inheritance (HP:0001423). + XLinkedRecessive, + /// Y-linked inheritance (HP:0001450). + YLinked, + /// Mitochondrial inheritance (HP:0001427). + Mitochondrial, +} + +impl ModeOfInheritance { + /// Allow parsing of `ModeOfInheritance` from HPO ID. + pub fn from_hpo_id(hpo_id: &str) -> Option { + match hpo_id { + "HP:0000006" | "HP:0012275" => Some(Self::AutosomalDominant), + "HP:0000007" => Some(Self::AutosomalRecessive), + "HP:0001419" => Some(Self::XLinkedDominant), + "HP:0001423" => Some(Self::XLinkedRecessive), + "HP:0001450" => Some(Self::YLinked), + "HP:0001427" => Some(Self::Mitochondrial), + _ => None, + } + } +} + +impl From for pbs_output::ModeOfInheritance { + fn from(val: ModeOfInheritance) -> Self { + match val { + ModeOfInheritance::AutosomalDominant => { + pbs_output::ModeOfInheritance::AutosomalDominant + } + ModeOfInheritance::AutosomalRecessive => { + pbs_output::ModeOfInheritance::AutosomalRecessive + } + ModeOfInheritance::XLinkedDominant => pbs_output::ModeOfInheritance::XLinkedDominant, + ModeOfInheritance::XLinkedRecessive => pbs_output::ModeOfInheritance::XLinkedRecessive, + ModeOfInheritance::YLinked => pbs_output::ModeOfInheritance::YLinked, + ModeOfInheritance::Mitochondrial => pbs_output::ModeOfInheritance::Mitochondrial, + } + } +} + +pub type HgncToMoiMap = indexmap::IndexMap>; + +/// Load the the `phenotype_to_genes.tsv` and `hgnc_xlink.tsv` files from the `hpo` +/// directory and build a map from HGNC gene ID to set of `ModeOfInheritance` +/// values. +pub fn load_hgnc_to_inheritance_map>( + path: &P, +) -> Result { + let phenotypes_to_genes = + phenotype_to_genes::load_entries(&path.as_ref().join("phenotype_to_genes.txt")) + .map_err(|e| anyhow::anyhow!("error loading phenotype_to_genes.txt: {}", e))?; + let ncbi_to_hgnc = hgnc_xlink::load_ncbi_to_hgnc(path.as_ref().join("hgnc_xlink.tsv")) + .map_err(|e| anyhow::anyhow!("error loading hgnc_xlink.tsv: {}", e))?; + + let mut result = indexmap::IndexMap::new(); + + for ptg_entry in phenotypes_to_genes { + if let Some(ncbi_gene_id) = ptg_entry.ncbi_gene_id { + if let Some(hgnc_ids) = ncbi_to_hgnc.get(&ncbi_gene_id) { + for hgnc_id in hgnc_ids { + if let Some(mode_of_inheritance) = + ModeOfInheritance::from_hpo_id(&ptg_entry.hpo_id) + { + let modes_of_inheritance = result + .entry(hgnc_id.clone()) + .or_insert_with(indexmap::IndexSet::new); + modes_of_inheritance.insert_sorted(mode_of_inheritance); + } + } + } + } + } + + Ok(result) +} + +/// Code for accessing the `phenotype_to_genes.tsv` file. +pub(super) mod phenotype_to_genes { + /// Data structure for representing an entry of the table. + #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] + #[serde_with::skip_serializing_none] + pub struct Entry { + /// Entrez gene ID. + #[serde(alias = "entrez_id")] + pub ncbi_gene_id: Option, + /// Gene symbol. + pub gene_symbol: String, + /// HPO ID. + pub hpo_id: String, + // HPO Name. + pub hpo_name: String, + } + + /// Read the `phenotype_to_genes.tsv` file using the `csv` crate via serde. + /// + /// # Errors + /// + /// In the case that the file could not be read. + pub fn load_entries>(path: &P) -> Result, anyhow::Error> { + let mut rdr = csv::ReaderBuilder::new() + .delimiter(b'\t') + .has_headers(true) + .from_path(path.as_ref())?; + let mut entries = Vec::new(); + for result in rdr.deserialize() { + let entry: Entry = result?; + entries.push(entry); + } + Ok(entries) + } +} + +/// Code for accessing the `hgnc_xlink.tsv` file. +pub(super) mod hgnc_xlink { + use std::collections::HashMap; + + /// Data structure for representing an entry of the table. + #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] + #[serde_with::skip_serializing_none] + pub struct Entry { + /// HGNC gene ID. + pub hgnc_id: String, + /// Ensembl gene ID. + pub ensembl_gene_id: Option, + /// Entrez gene ID. + #[serde(alias = "entrez_id")] + pub ncbi_gene_id: Option, + /// Gene symbol. + pub gene_symbol: String, + } + + /// Read the `hgnc_xlink.tsv` file using the `csv` crate via serde. + /// + /// # Errors + /// + /// In the case that the file could not be read. + pub fn load_entries>(path: &P) -> Result, anyhow::Error> { + let mut rdr = csv::ReaderBuilder::new() + .delimiter(b'\t') + .has_headers(true) + .from_path(path.as_ref())?; + let mut entries = Vec::new(); + for result in rdr.deserialize() { + let entry: Entry = result?; + entries.push(entry); + } + Ok(entries) + } + + /// Read the `hgnc_xlink.tsv` into a map from NCBI gene ID to HGNC gene ID. + /// + /// # Errors + /// + /// In the case that the file could not be read. + pub fn load_ncbi_to_hgnc>( + path: P, + ) -> Result>, anyhow::Error> { + let mut map = HashMap::new(); + for entry in load_entries(&path)? { + if let Some(ncbi_gene_id) = entry.ncbi_gene_id { + let hgnc_id = entry.hgnc_id; + let entry = map.entry(ncbi_gene_id).or_insert_with(Vec::new); + entry.push(hgnc_id); + } + } + Ok(map) + } +} + +#[cfg(test)] +mod test { + #[test] + pub fn test_mode_of_inheritance_from_hpo_id() { + assert_eq!( + super::ModeOfInheritance::from_hpo_id("HP:0000006"), + Some(super::ModeOfInheritance::AutosomalDominant) + ); + assert_eq!( + super::ModeOfInheritance::from_hpo_id("HP:0012275"), + Some(super::ModeOfInheritance::AutosomalDominant) + ); + assert_eq!( + super::ModeOfInheritance::from_hpo_id("HP:0000007"), + Some(super::ModeOfInheritance::AutosomalRecessive) + ); + assert_eq!( + super::ModeOfInheritance::from_hpo_id("HP:0001419"), + Some(super::ModeOfInheritance::XLinkedDominant) + ); + assert_eq!( + super::ModeOfInheritance::from_hpo_id("HP:0001423"), + Some(super::ModeOfInheritance::XLinkedRecessive) + ); + assert_eq!( + super::ModeOfInheritance::from_hpo_id("HP:0001450"), + Some(super::ModeOfInheritance::YLinked) + ); + assert_eq!( + super::ModeOfInheritance::from_hpo_id("HP:0001427"), + Some(super::ModeOfInheritance::Mitochondrial) + ); + + assert_eq!(super::ModeOfInheritance::from_hpo_id("HP:0000000"), None); + } + + #[test] + pub fn test_mode_of_inheritance_into_pbs() { + assert_eq!( + Into::::into( + super::ModeOfInheritance::AutosomalDominant + ), + super::pbs_output::ModeOfInheritance::AutosomalDominant + ); + assert_eq!( + Into::::into( + super::ModeOfInheritance::AutosomalRecessive + ), + super::pbs_output::ModeOfInheritance::AutosomalRecessive + ); + assert_eq!( + Into::::into( + super::ModeOfInheritance::XLinkedDominant + ), + super::pbs_output::ModeOfInheritance::XLinkedDominant + ); + assert_eq!( + Into::::into( + super::ModeOfInheritance::XLinkedRecessive + ), + super::pbs_output::ModeOfInheritance::XLinkedRecessive + ); + assert_eq!( + Into::::into(super::ModeOfInheritance::YLinked), + super::pbs_output::ModeOfInheritance::YLinked + ); + assert_eq!( + Into::::into( + super::ModeOfInheritance::Mitochondrial + ), + super::pbs_output::ModeOfInheritance::Mitochondrial + ); + } + + #[test] + fn load_hgnc_to_inheritance_map() -> Result<(), anyhow::Error> { + let path = std::path::Path::new("tests/seqvars/query/db/hpo"); + let map = super::load_hgnc_to_inheritance_map(&path)?; + + assert_eq!(map.len(), 4599); + insta::assert_yaml_snapshot!(&map[0..5]); + + Ok(()) + } + + #[test] + fn test_phenotype_to_genes_load_entries() -> Result<(), anyhow::Error> { + let path = std::path::Path::new("tests/seqvars/query/db/hpo/phenotype_to_genes.txt"); + let entries = super::phenotype_to_genes::load_entries(&path)?; + + assert_eq!(entries.len(), 988720); + insta::assert_yaml_snapshot!(&entries[0..5]); + + Ok(()) + } + + #[test] + fn test_hgnc_to_xlink_load_entries() -> Result<(), anyhow::Error> { + let path = std::path::Path::new("tests/seqvars/query/db/hpo/hgnc_xlink.tsv"); + let map = super::hgnc_xlink::load_entries(&path)?; + + assert_eq!(map.len(), 43839); + insta::assert_yaml_snapshot!(&map[0..5]); + + Ok(()) + } + + #[test] + fn test_load_ncbi_to_hgnc() -> Result<(), anyhow::Error> { + let path = std::path::Path::new("tests/seqvars/query/db/hpo/hgnc_xlink.tsv"); + let map = super::hgnc_xlink::load_ncbi_to_hgnc(&path)?; + + assert_eq!(map.len(), 43792); + assert_eq!(map.get(&1), Some(&vec!["HGNC:5".to_string()])); + assert_eq!(map.get(&2), Some(&vec!["HGNC:7".to_string()])); + + Ok(()) + } +} diff --git a/src/seqvars/query/mod.rs b/src/seqvars/query/mod.rs index 3c7cbe36..01c2daa6 100644 --- a/src/seqvars/query/mod.rs +++ b/src/seqvars/query/mod.rs @@ -1,6 +1,7 @@ //! Code implementing the "seqvars query" sub command. pub mod annonars; +pub mod hpo; pub mod interpreter; pub mod schema; pub mod sorting; @@ -712,6 +713,7 @@ impl WithSeqvarAndAnnotator for pbs_output::GeneRelatedAnnotation { let gene_record = annotator .query_genes(&hgnc_id) .map_err(|e| anyhow::anyhow!("problem querying genes database: {}", e))?; + let mois = annotator.hgnc_to_moi.get(&hgnc_id); return Ok(Self { identity: Some(pbs_output::GeneIdentity { @@ -719,7 +721,7 @@ impl WithSeqvarAndAnnotator for pbs_output::GeneRelatedAnnotation { gene_symbol: ann.gene_symbol.clone(), }), consequences: gene_related_annotation::consequences(ann)?, - phenotypes: gene_related_annotation::phenotypes(&gene_record), + phenotypes: gene_related_annotation::phenotypes(&gene_record, mois), constraints: gene_related_annotation::constraints(&gene_record)?, }); } @@ -756,12 +758,19 @@ mod gene_related_annotation { pub(crate) fn phenotypes( gene_record: &Option<::annonars::pbs::genes::base::Record>, + mois: Option<&indexmap::IndexSet>, ) -> Option { gene_record .as_ref() .map(|gene_record| pbs_output::GeneRelatedPhenotypes { is_acmg_sf: gene_record.acmg_sf.is_some(), is_disease_gene: gene_record.omim.is_some() || gene_record.orpha.is_some(), + modes_of_inheritance: mois + .cloned() + .unwrap_or_default() + .into_iter() + .map(|moi| Into::::into(moi) as i32) + .collect::>(), }) } diff --git a/src/seqvars/query/snapshots/varfish_server_worker__seqvars__query__hpo__test__hgnc_to_xlink_load_entries.snap b/src/seqvars/query/snapshots/varfish_server_worker__seqvars__query__hpo__test__hgnc_to_xlink_load_entries.snap new file mode 100644 index 00000000..070e3ee6 --- /dev/null +++ b/src/seqvars/query/snapshots/varfish_server_worker__seqvars__query__hpo__test__hgnc_to_xlink_load_entries.snap @@ -0,0 +1,24 @@ +--- +source: src/seqvars/query/hpo.rs +expression: "&map[0..5]" +--- +- hgnc_id: "HGNC:5" + ensembl_gene_id: ENSG00000121410 + ncbi_gene_id: 1 + gene_symbol: A1BG +- hgnc_id: "HGNC:37133" + ensembl_gene_id: ENSG00000268895 + ncbi_gene_id: 503538 + gene_symbol: A1BG-AS1 +- hgnc_id: "HGNC:24086" + ensembl_gene_id: ENSG00000148584 + ncbi_gene_id: 29974 + gene_symbol: A1CF +- hgnc_id: "HGNC:7" + ensembl_gene_id: ENSG00000175899 + ncbi_gene_id: 2 + gene_symbol: A2M +- hgnc_id: "HGNC:27057" + ensembl_gene_id: ENSG00000245105 + ncbi_gene_id: 144571 + gene_symbol: A2M-AS1 diff --git a/src/seqvars/query/snapshots/varfish_server_worker__seqvars__query__hpo__test__load_hgnc_to_inheritance_map.snap b/src/seqvars/query/snapshots/varfish_server_worker__seqvars__query__hpo__test__load_hgnc_to_inheritance_map.snap new file mode 100644 index 00000000..6fa47edf --- /dev/null +++ b/src/seqvars/query/snapshots/varfish_server_worker__seqvars__query__hpo__test__load_hgnc_to_inheritance_map.snap @@ -0,0 +1,15 @@ +--- +source: src/seqvars/query/hpo.rs +expression: "&map[0..5]" +--- +- - "HGNC:7114" + - - AutosomalDominant +- - "HGNC:19309" + - - AutosomalDominant +- - "HGNC:16391" + - - AutosomalRecessive +- - "HGNC:1966" + - - AutosomalDominant + - AutosomalRecessive +- - "HGNC:4241" + - - AutosomalRecessive diff --git a/src/seqvars/query/snapshots/varfish_server_worker__seqvars__query__hpo__test__phenotype_to_genes_load_entries.snap b/src/seqvars/query/snapshots/varfish_server_worker__seqvars__query__hpo__test__phenotype_to_genes_load_entries.snap new file mode 100644 index 00000000..9a2036c7 --- /dev/null +++ b/src/seqvars/query/snapshots/varfish_server_worker__seqvars__query__hpo__test__phenotype_to_genes_load_entries.snap @@ -0,0 +1,24 @@ +--- +source: src/seqvars/query/hpo.rs +expression: "&entries[0..5]" +--- +- ncbi_gene_id: 3081 + gene_symbol: HGD + hpo_id: "HP:0008775" + hpo_name: Abnormal prostate morphology +- ncbi_gene_id: 695 + gene_symbol: BTK + hpo_id: "HP:0008775" + hpo_name: Abnormal prostate morphology +- ncbi_gene_id: 695 + gene_symbol: BTK + hpo_id: "HP:0008775" + hpo_name: Abnormal prostate morphology +- ncbi_gene_id: 1493 + gene_symbol: CTLA4 + hpo_id: "HP:0008775" + hpo_name: Abnormal prostate morphology +- ncbi_gene_id: 26191 + gene_symbol: PTPN22 + hpo_id: "HP:0008775" + hpo_name: Abnormal prostate morphology diff --git a/src/strucvars/aggregate/cli.rs b/src/strucvars/aggregate/cli.rs index 9e73347c..a2b32eec 100644 --- a/src/strucvars/aggregate/cli.rs +++ b/src/strucvars/aggregate/cli.rs @@ -84,7 +84,7 @@ async fn split_input_by_chrom_and_sv_type( .expect("no file for chrom/sv_type"); #[allow(clippy::needless_borrows_for_generic_args)] to_writer(&mut tmp_file, &input_record)?; - tmp_file.write_all(&[b'\n'])?; + tmp_file.write_all(b"\n")?; // Write out progress indicator every 60 seconds. if prev.elapsed().as_secs() >= 60 { diff --git a/tests/strucvars/query/db/hpo/hgnc_xlink.tsv b/tests/strucvars/query/db/hpo/hgnc_xlink.tsv new file mode 100644 index 00000000..e6ff6ff1 --- /dev/null +++ b/tests/strucvars/query/db/hpo/hgnc_xlink.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36fea776c169438bf476b25b21a2dbf5a3ae0451aa6aef2126bd9aac8fc54282 +size 1811518 diff --git a/tests/strucvars/query/db/hpo/phenotype_to_genes.txt b/tests/strucvars/query/db/hpo/phenotype_to_genes.txt new file mode 100644 index 00000000..3e886831 --- /dev/null +++ b/tests/strucvars/query/db/hpo/phenotype_to_genes.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41aef588fbcdeaa3c36b236827a384f9b2f8add720fe9cdf2dd9a7120791643e +size 49631290