From 58415dec1694ad13649d64f52f972c4f4450307e Mon Sep 17 00:00:00 2001 From: Arnaud Mimart <33665250+amimart@users.noreply.github.com> Date: Thu, 14 Dec 2023 14:14:33 +0100 Subject: [PATCH] feat(rdf): add basic error management --- Cargo.lock | 1 + packages/okp4-rdf/Cargo.toml | 1 + packages/okp4-rdf/src/normalize.rs | 108 +++++++++++++++++++---------- 3 files changed, 74 insertions(+), 36 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0aab8e9e..e900a60f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -848,6 +848,7 @@ dependencies = [ "rio_turtle", "rio_xml", "sha2 0.10.8", + "thiserror", ] [[package]] diff --git a/packages/okp4-rdf/Cargo.toml b/packages/okp4-rdf/Cargo.toml index 7e128847..0c44b4aa 100644 --- a/packages/okp4-rdf/Cargo.toml +++ b/packages/okp4-rdf/Cargo.toml @@ -11,3 +11,4 @@ rio_api.workspace = true rio_turtle.workspace = true rio_xml.workspace = true sha2 = "0.10.8" +thiserror.workspace = true diff --git a/packages/okp4-rdf/src/normalize.rs b/packages/okp4-rdf/src/normalize.rs index be11e388..90c2e735 100644 --- a/packages/okp4-rdf/src/normalize.rs +++ b/packages/okp4-rdf/src/normalize.rs @@ -4,6 +4,7 @@ use sha2::Digest; use std::collections::hash_map::Entry; use std::collections::HashMap; use std::ops::Index; +use thiserror::Error; /// A RDF normalizer allowing to canonicalize RDF data, following the https://www.w3.org/TR/rdf-canon specification. #[derive(Eq, PartialEq, Debug)] @@ -14,6 +15,13 @@ pub struct Normalizer<'a> { canonical_issuer: IdentifierIssuer, } +#[derive(Error, Debug, Eq, PartialEq)] +pub enum NormalizationError { + /// An unexpected error denotes an error that should never occur. + #[error("An unexpected error occurred: {0}")] + Unexpected(String), +} + impl<'a> Normalizer<'a> { const CANONICAL_BLANK_NODES_IDENTIFIER_PREFIX: &'static str = "c14n"; const TEMPORARY_BLANK_NODES_IDENTIFIER_PREFIX: &'static str = "b"; @@ -36,33 +44,48 @@ impl<'a> Normalizer<'a> { } } - pub fn normalize(&'a mut self, dataset: &[Quad<'a>]) -> String { + pub fn normalize(&'a mut self, dataset: &[Quad<'a>]) -> Result { self.reset(); self.track_blank_nodes(dataset); self.compute_first_degree_hashes(); - self.label_unique_nodes(); - self.compute_n_degree_hashes(); + self.label_unique_nodes()?; + self.compute_n_degree_hashes()?; let mut canonicalized_dataset = dataset.to_vec(); for quad in canonicalized_dataset.iter_mut() { if let Subject::BlankNode(n) = quad.subject { quad.subject = Subject::BlankNode(BlankNode { - id: self.canonical_issuer.get(n.id).unwrap(), + id: self.canonical_issuer.get(n.id).ok_or_else(|| { + NormalizationError::Unexpected( + "Could not replace subject blank node, canonical identifier not found" + .to_string(), + ) + })?, }); } if let Term::BlankNode(n) = quad.object { quad.object = Term::BlankNode(BlankNode { - id: self.canonical_issuer.get(n.id).unwrap(), + id: self.canonical_issuer.get(n.id).ok_or_else(|| { + NormalizationError::Unexpected( + "Could not replace object blank node, canonical identifier not found" + .to_string(), + ) + })?, }); } if let Some(GraphName::BlankNode(n)) = quad.graph_name { quad.graph_name = Some(GraphName::BlankNode(BlankNode { - id: self.canonical_issuer.get(n.id).unwrap(), + id: self.canonical_issuer.get(n.id).ok_or_else(|| { + NormalizationError::Unexpected( + "Could not replace graph blank node, canonical identifier not found" + .to_string(), + ) + })?, })); } } - Self::serialize(&canonicalized_dataset) + Ok(Self::serialize(&canonicalized_dataset)) } fn reset(&mut self) { @@ -105,9 +128,8 @@ impl<'a> Normalizer<'a> { } } - fn label_unique_nodes(&mut self) { + fn label_unique_nodes(&mut self) -> Result<(), NormalizationError> { let mut sorted_hash = Vec::with_capacity(self.hash_to_blank_nodes.len()); - for hash in self.hash_to_blank_nodes.iter().filter_map(|(key, nodes)| { if nodes.len() > 1 { return None; @@ -122,14 +144,20 @@ impl<'a> Normalizer<'a> { self.canonical_issuer.get_or_issue( self.hash_to_blank_nodes .remove(&hash) - .unwrap() + .ok_or_else(|| { + NormalizationError::Unexpected( + "Could not label unique node, hash not found".to_string(), + ) + })? .index(0) .clone(), ); } + + Ok(()) } - fn compute_n_degree_hashes(&mut self) { + fn compute_n_degree_hashes(&mut self) -> Result<(), NormalizationError> { let mut sorted_first_degree_hashes: Vec = Vec::with_capacity(self.hash_to_blank_nodes.len()); sorted_first_degree_hashes.extend(self.hash_to_blank_nodes.keys().cloned()); @@ -154,7 +182,7 @@ impl<'a> Normalizer<'a> { ); scoped_issuer.get_or_issue(node.clone()); - let (n_degree_hash, _) = self.compute_n_degree_hash(&mut scoped_issuer, node); + let (n_degree_hash, _) = self.compute_n_degree_hash(&mut scoped_issuer, node)?; hash_to_node.insert(n_degree_hash.clone(), node.clone()); sorted_n_degree_hashes.push(n_degree_hash); } @@ -166,18 +194,23 @@ impl<'a> Normalizer<'a> { } } } + + Ok(()) } fn compute_n_degree_hash( &mut self, scoped_issuer: &mut IdentifierIssuer, node: &String, - ) -> (String, IdentifierIssuer) { + ) -> Result<(String, IdentifierIssuer), NormalizationError> { let mut hashes: HashMap> = HashMap::new(); - // TODO: manage an error if quads not found instead.. - for quad in self.blank_node_to_quads.get(node).unwrap() { - [ + for quad in self.blank_node_to_quads.get(node).ok_or_else(|| { + NormalizationError::Unexpected( + "Could not compute n degree hash, quads for node not found".to_string(), + ) + })? { + for (related, position) in [ match quad.subject { Subject::BlankNode(BlankNode { id }) if id != node => { Some((id, Self::HASH_RELATED_BLANK_NODE_POSITION_S)) @@ -199,15 +232,15 @@ impl<'a> Normalizer<'a> { ] .iter() .flatten() - .for_each(|(related, position)| { + { let hash = - self.compute_related_blank_node_hash(quad, scoped_issuer, related, position); + self.compute_related_blank_node_hash(quad, scoped_issuer, related, position)?; hashes .entry(hash) .and_modify(|v| v.push(related.to_string())) .or_insert(vec![related.to_string()]); - }); + } } let mut sorted_hashes: Vec<&String> = Vec::with_capacity(hashes.len()); @@ -244,7 +277,7 @@ impl<'a> Normalizer<'a> { } for related in recursion_list { - let (result, mut issuer) = self.compute_n_degree_hash(&mut issuer, &related); + let (result, mut issuer) = self.compute_n_degree_hash(&mut issuer, &related)?; path.push_str("_:"); path.push_str(issuer.get_or_issue(related).as_str()); path.push('<'); @@ -268,10 +301,10 @@ impl<'a> Normalizer<'a> { hasher.update(chosen_path.as_str()); } - ( + Ok(( base16ct::lower::encode_string(&hasher.finalize()), chosen_issuer, - ) + )) } fn compute_related_blank_node_hash( @@ -280,7 +313,7 @@ impl<'a> Normalizer<'a> { scoped_issuer: &mut IdentifierIssuer, node: &str, position: &str, - ) -> String { + ) -> Result { let mut hasher = sha2::Sha256::new(); hasher.update(position); if position != Self::HASH_RELATED_BLANK_NODE_POSITION_G { @@ -291,18 +324,19 @@ impl<'a> Normalizer<'a> { hasher.update("_:"); - // TODO: consider to manage the case the node doesn't exists in blank_node_to_hash map and output - // an error. This cannot occur as every blank nodes has a computed first degree hash.. - if let Some(hash) = self - .canonical_issuer - .get(node) - .or_else(|| scoped_issuer.get(node)) - .or_else(|| self.blank_node_to_hash.get(node)) - { - hasher.update(hash); - } - - base16ct::lower::encode_string(&hasher.finalize()) + hasher.update( + self.canonical_issuer + .get(node) + .or_else(|| scoped_issuer.get(node)) + .or_else(|| self.blank_node_to_hash.get(node)) + .ok_or_else(|| { + NormalizationError::Unexpected( + "Could not compute related node hash, node not found".to_string(), + ) + })?, + ); + + Ok(base16ct::lower::encode_string(&hasher.finalize())) } fn serialize(quads: &[Quad<'_>]) -> String { @@ -577,7 +611,9 @@ mod test { for case in cases { let mut normalizer = Normalizer::new(); - assert_eq!(normalizer.normalize(&case.0), case.1); + let res = normalizer.normalize(&case.0); + assert!(res.is_ok()); + assert_eq!(res.unwrap(), case.1); } }