From e1b193ea26df662424294e23a32931755cea3384 Mon Sep 17 00:00:00 2001 From: Jonas Marcello Date: Tue, 11 Jun 2024 07:36:44 +0200 Subject: [PATCH] Fixed merge request coming from parser refactoring on multiple branches --- src/annotations/omim_disease.rs | 11 ++-- src/ontology.rs | 109 ++++---------------------------- src/parser.rs | 64 ++++++++++++------- 3 files changed, 60 insertions(+), 124 deletions(-) diff --git a/src/annotations/omim_disease.rs b/src/annotations/omim_disease.rs index 138151c..1eec5a6 100644 --- a/src/annotations/omim_disease.rs +++ b/src/annotations/omim_disease.rs @@ -6,8 +6,7 @@ use std::hash::Hash; use crate::annotations::disease::DiseaseIterator; use crate::annotations::{AnnotationId, Disease}; use crate::term::HpoGroup; -use crate::HpoError; -use crate::HpoTermId; +use crate::{HpoError, HpoSet, HpoTermId, Ontology}; /// A set of OMIM diseases /// @@ -114,7 +113,7 @@ impl Disease for OmimDisease { /// # Examples /// /// ``` - /// use hpo::annotations::OmimDisease; + /// use hpo::annotations::{Disease, OmimDisease}; /// /// let mut disease = OmimDisease::new(123.into(), "FooBar"); /// let bytes = disease.as_bytes(); @@ -123,7 +122,7 @@ impl Disease for OmimDisease { /// assert_eq!(bytes[4..8], [0u8, 0u8, 0u8, 123u8]); // ID of disease => 123 /// assert_eq!(bytes[8..12], [0u8, 0u8, 0u8, 6u8]); // Length of Name => 6 /// ``` - pub fn as_bytes(&self) -> Vec { + fn as_bytes(&self) -> Vec { fn usize_to_u32(n: usize) -> u32 { n.try_into().expect("unable to convert {n} to u32") } @@ -157,7 +156,7 @@ impl Disease for OmimDisease { } /// Returns an [`HpoSet`] from the `OmimDisease` - pub fn to_hpo_set<'a>(&self, ontology: &'a Ontology) -> HpoSet<'a> { + fn to_hpo_set<'a>(&self, ontology: &'a Ontology) -> HpoSet<'a> { HpoSet::new(ontology, self.hpos.clone()) } @@ -167,7 +166,7 @@ impl Disease for OmimDisease { /// /// This method does **not** add the [`OmimDisease`] to the [HPO term](`crate::HpoTerm`). /// Clients should not use this method, unless they are creating their own Ontology. - pub fn add_term>(&mut self, term_id: I) -> bool { + fn add_term>(&mut self, term_id: I) -> bool { self.hpos.insert(term_id) } } diff --git a/src/ontology.rs b/src/ontology.rs index dce474d..f891a95 100644 --- a/src/ontology.rs +++ b/src/ontology.rs @@ -611,87 +611,6 @@ impl Ontology { } } - /// Returns a binary representation of the Ontology - /// - /// The binary data is separated into sections: - /// - /// - Metadata (HPO and Bindat Version) (see `Ontology::metadata_as_bytes`) - /// - Terms (Names + IDs) (see `HpoTermInternal::as_bytes`) - /// - Term - Parent connection (Child ID - Parent ID) - /// (see `HpoTermInternal::parents_as_byte`) - /// - Genes (Names + IDs + Connected HPO Terms) ([`Gene::as_bytes`]) - /// - OMIM Diseases (Names + IDs + Connected HPO Terms) - /// ([`OmimDisease::as_bytes`]) - /// - /// Every section starts with 4 bytes to indicate its size - /// (big-endian encoded `u32`) - /// - /// This method is only useful if you use are modifying the ontology - /// and want to save data for later re-use. - /// - /// # Panics - /// - /// Panics when the buffer length of any subsegment larger than `u32::MAX` - /// - /// # Examples - /// - /// ``` - /// use hpo::Ontology; - /// let ontology = Ontology::from_binary("tests/example.hpo").unwrap(); - /// let bytes = ontology.as_bytes(); - /// ``` - pub fn as_bytes(&self) -> Vec { - fn usize_to_u32(n: usize) -> u32 { - n.try_into().expect("unable to convert {n} to u32") - } - let mut res = Vec::new(); - - // Add metadata, version info - res.append(&mut self.metadata_as_bytes()); - - // All HPO Terms - let mut buffer = Vec::new(); - for term in self.hpo_terms.values() { - buffer.append(&mut term.as_bytes()); - } - res.append(&mut usize_to_u32(buffer.len()).to_be_bytes().to_vec()); - res.append(&mut buffer); - - // All Term - Parent connections - buffer.clear(); - for term in self.hpo_terms.values() { - buffer.append(&mut term.parents_as_byte()); - } - res.append(&mut usize_to_u32(buffer.len()).to_be_bytes().to_vec()); - res.append(&mut buffer); - - // Genes and Gene-Term connections - buffer.clear(); - for gene in self.genes.values() { - buffer.append(&mut gene.as_bytes()); - } - res.append(&mut usize_to_u32(buffer.len()).to_be_bytes().to_vec()); - res.append(&mut buffer); - - // OMIM Disease and Disease-Term connections - buffer.clear(); - for omim_disease in self.omim_diseases.values() { - buffer.append(&mut omim_disease.as_bytes()); - } - res.append(&mut usize_to_u32(buffer.len()).to_be_bytes().to_vec()); - res.append(&mut buffer); - - // ORPHA Disease and Disease-Term connections - buffer.clear(); - for orpha_disease in self.orpha_diseases.values() { - buffer.append(&mut orpha_disease.as_bytes()); - } - res.append(&mut usize_to_u32(buffer.len()).to_be_bytes().to_vec()); - res.append(&mut buffer); - - res - } - /// Returns the number of HPO-Terms in the Ontology /// /// # Examples @@ -718,16 +637,6 @@ impl Ontology { self.len() == 0 } - /// Returns the Jax-Ontology release version - /// - /// e.g. `2023-03-13` - pub fn hpo_version(&self) -> String { - format!( - "{:0>4}-{:0>2}-{:0>2}", - self.hpo_version.0, self.hpo_version.1, self.hpo_version.2, - ) - } - /// Returns the [`HpoTerm`] of the provided [`HpoTermId`] /// /// If no such term is present in the Ontolgy, `None` is returned @@ -1087,6 +996,14 @@ impl Ontology { res.append(&mut usize_to_u32(buffer.len()).to_be_bytes().to_vec()); res.append(&mut buffer); + // ORPHA Disease and Disease-Term connections + buffer.clear(); + for orpha_disease in self.orpha_diseases.values() { + buffer.append(&mut orpha_disease.as_bytes()); + } + res.append(&mut usize_to_u32(buffer.len()).to_be_bytes().to_vec()); + res.append(&mut buffer); + res } @@ -1165,16 +1082,16 @@ impl Ontology { if (gene.hpo_terms() & &phenotype_ids).is_empty() { continue; } - let gene_id = ont.add_gene( + ont.add_gene( self.gene(gene.id()).ok_or(HpoError::DoesNotExist)?.name(), - &gene.id().as_u32().to_string() - )?; + *gene.id(), + ); // Link the gene to every term in the new ontology // --> also modifier terms for term in &(gene.hpo_terms() & &ids) { - ont.link_gene_term(term, gene_id)?; - ont.gene_mut(&gene_id) + ont.link_gene_term(term, *gene.id())?; + ont.gene_mut(gene.id()) .ok_or(HpoError::DoesNotExist)? .add_term(term); } diff --git a/src/parser.rs b/src/parser.rs index 82821fe..0b90a4a 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -366,35 +366,44 @@ pub(crate) mod disease_to_hpo { fn parse_line(line: &str) -> HpoResult>> { if line.starts_with("OMIM") { - parse_disease_components(line).map(DiseaseKind::Omim) + Ok(parse_disease_components(line)?.map(DiseaseKind::Omim)) } else if line.starts_with("ORPHA") { - parse_disease_components(line).map(DiseaseKind::Orpha) + Ok(parse_disease_components(line)?.map(DiseaseKind::Orpha)) } else { Ok(None) } } - fn parse_disease_components(line: &str) -> HpoResul>> { - let cols: Vec<&str> = line.trim().split('\t').collect(); - if cols[2] == "NOT" { - return Ok(None); - } + fn parse_disease_components(line: &str) -> HpoResult>> { + let mut cols = line.trim().splitn(5, '\t'); + + let Some(id_col) = cols.next() else { + return Err(HpoError::InvalidInput(line.to_string())); + }; - let Some((_, omim_id)) = cols[0].split_once(':') else { - error!("cannot parse Disease ID from {}", cols[0]); + let Some((_, disease_id)) = id_col.split_once(':') else { return Err(HpoError::InvalidInput(line.to_string())); }; - - let Ok(hpo_id) = HpoTermId::try_from(cols[3]) else { - error!("invalid HPO ID: {}", cols[3]); + + let Some(disease_name) = cols.next() else { return Err(HpoError::InvalidInput(line.to_string())); }; - + + if let Some("NOT") = cols.next() { + return Ok(None); + }; + + let hpo_id = if let Some(id) = cols.next() { + HpoTermId::try_from(id)? + } else { + return Err(HpoError::InvalidInput(line.to_string())); + }; + Ok(Some(DiseaseComponents { - id: omim_id, - name: omim_name, + id: disease_id, + name: disease_name, hpo_id, - }))) + })) } /// Quick and dirty parser for development and debugging @@ -456,11 +465,18 @@ pub(crate) mod disease_to_hpo { } #[test] - fn test_skip_orpha() { + fn test_correct_orpha() { let s = "ORPHA:600171\tGonadal agenesis\t\tHP:0000055\tOMIM:600171\tTAS\tP\tHPO:skoehler[2014-11-27]"; - assert!(parse_line(s) + let orpha = parse_line(s) .expect("This line has the correct format") - .is_none()); + .expect("Line describes an Omim disease"); + if let DiseaseKind::Orpha(orpha) = orpha { + assert_eq!(orpha.name, "Gonadal agenesis"); + assert_eq!(orpha.id, "600171"); + assert_eq!(orpha.hpo_id, "HP:0000055"); + } else { + panic!("Orpha line should be parsed as Orpha correctly"); + } } #[test] @@ -477,9 +493,13 @@ pub(crate) mod disease_to_hpo { let omim = parse_line(s) .expect("This line has the correct format") .expect("Line describes an Omim disease"); - assert_eq!(omim.name, "Gonadal agenesis"); - assert_eq!(omim.id, "600171"); - assert_eq!(omim.hpo_id, "HP:0000055"); + if let DiseaseKind::Omim(omim) = omim { + assert_eq!(omim.name, "Gonadal agenesis"); + assert_eq!(omim.id, "600171"); + assert_eq!(omim.hpo_id, "HP:0000055"); + } else { + panic!("Omim line should be parsed as Omim correctly"); + } } #[test]