From 081391a555da104f1bc0524bedfdfa4b2b93cfac Mon Sep 17 00:00:00 2001 From: Nordine Bittich Date: Mon, 25 Nov 2024 00:22:43 +0100 Subject: [PATCH] IRI proper implementation: ipv4 & ipv6 --- examples/turtle_doc/input/0028.ttl | 24 ++++ examples/turtle_doc/output/0028.ttl | 7 + src/iri.rs | 190 ++++++++++++++++++++++++++++ src/lib.rs | 4 +- src/tests/mod.rs | 2 +- src/tests/turtle_doc_test.rs | 1 + 6 files changed, 225 insertions(+), 3 deletions(-) create mode 100644 examples/turtle_doc/input/0028.ttl create mode 100644 examples/turtle_doc/output/0028.ttl create mode 100644 src/iri.rs diff --git a/examples/turtle_doc/input/0028.ttl b/examples/turtle_doc/input/0028.ttl new file mode 100644 index 0000000..f92bded --- /dev/null +++ b/examples/turtle_doc/input/0028.ttl @@ -0,0 +1,24 @@ +# A triple with all absolute IRIs + . + +@base . + . # relative IRIs, e.g. http://one.example/subject2 + +BASE + . # relative IRIs, e.g. http://one.example/subject2 + +@prefix p: . +p:subject3 p:predicate3 p:object3 . # prefixed name, e.g. http://two.example/subject3 + +PREFIX p: +p:subject3 p:predicate3 p:object3 . # prefixed name, e.g. http://two.example/subject3 + +@prefix p: . # prefix p: now stands for http://one.example/path/ +p:subject4 p:predicate4 p:object4 . # prefixed name, e.g. http://one.example/path/subject4 + +@prefix : . # empty prefix +:subject5 :predicate5 :object5 . # prefixed name, e.g. http://another.example/subject5 + +:subject6 a :subject7 . # same as :subject6 :subject7 . + + a :subject8 . # a multi-script subject IRI . diff --git a/examples/turtle_doc/output/0028.ttl b/examples/turtle_doc/output/0028.ttl new file mode 100644 index 0000000..a100141 --- /dev/null +++ b/examples/turtle_doc/output/0028.ttl @@ -0,0 +1,7 @@ + . + . + . + . + . + . + . 
diff --git a/src/iri.rs b/src/iri.rs
new file mode 100644
index 0000000..aaedbfb
--- /dev/null
+++ b/src/iri.rs
@@ -0,0 +1,274 @@
+#![allow(unused)]
+
+use std::{collections::VecDeque, ops::RangeBounds};
+
+use chrono::ParseResult;
+use nom::{
+    bytes::complete::take_while_m_n,
+    character::complete::one_of,
+    combinator::{not, success, verify},
+    error::{ParseError, VerboseError},
+    multi::{many1, many_m_n},
+    sequence::{pair, terminated},
+};
+
+use crate::prelude::*;
+
+/// One component of the textual form of an IPv6 address.
+pub enum Segment {
+    /// A group of one to four hexadecimal digits.
+    Hextet(u16),
+    /// The `::` marker that stands for one or more all-zero groups.
+    Compressed,
+    /// A dotted-quad IPv4 address; it fills the last two groups.
+    IpV4(Vec<u8>),
+}
+
+/// Parses the textual form of an IPv6 address into its eight 16-bit groups.
+///
+/// Supports the full notation (eight groups), a single `::` compression and a
+/// trailing dotted-quad IPv4 address (e.g. `::ffff:192.0.2.128`).
+///
+/// # Errors
+///
+/// Fails when the text does not describe exactly eight groups, when `::` has
+/// nothing left to stand for, when an IPv4 part is not the last component, or
+/// when the address is directly followed by `:` or `.`.
+fn parse_ip_v6(s: &str) -> ParserResult<Vec<u16>> {
+    fn hex_to_u16(input: &str) -> Result<u16, std::num::ParseIntError> {
+        u16::from_str_radix(input, 16)
+    }
+    fn recognize_hexadecimal(input: &str) -> ParserResult<&str> {
+        recognize(take_while_m_n(1, 4, |c: char| c.is_ascii_hexdigit()))(input)
+    }
+    fn hextet(s: &str) -> ParserResult<u16> {
+        map_res(recognize_hexadecimal, hex_to_u16)(s)
+    }
+    // A single group. The IPv4 form is tried first because its first octet
+    // would otherwise be consumed as a hextet.
+    fn group(s: &str) -> ParserResult<Segment> {
+        alt((
+            map(parse_ip_v4, Segment::IpV4),
+            map(hextet, Segment::Hextet),
+        ))(s)
+    }
+    // Zero or more groups separated by single colons.
+    fn groups(s: &str) -> ParserResult<Vec<Segment>> {
+        map(
+            opt(separated_list1(tag(":"), group)),
+            Option::unwrap_or_default,
+        )(s)
+    }
+    // `groups [ "::" groups ]`, flattened around a `Compressed` marker.
+    fn segments(s: &str) -> ParserResult<Vec<Segment>> {
+        map(
+            pair(groups, opt(preceded(tag("::"), groups))),
+            |(mut list, tail): (Vec<Segment>, Option<Vec<Segment>>)| {
+                if let Some(tail) = tail {
+                    list.push(Segment::Compressed);
+                    list.extend(tail);
+                }
+                list
+            },
+        )(s)
+    }
+    fn is_well_formed(list: &[Segment]) -> bool {
+        let compressed = list.iter().any(|seg| matches!(seg, Segment::Compressed));
+        // An embedded IPv4 address may only be the very last component.
+        let ipv4_is_last = list
+            .iter()
+            .rev()
+            .skip(1)
+            .all(|seg| !matches!(seg, Segment::IpV4(_)));
+        let width: usize = list
+            .iter()
+            .map(|seg| match seg {
+                Segment::Hextet(_) => 1,
+                Segment::Compressed => 0,
+                Segment::IpV4(_) => 2,
+            })
+            .sum();
+        // `::` replaces at least one group, so at most seven may be explicit.
+        let width_ok = if compressed { width <= 7 } else { width == 8 };
+        ipv4_is_last && width_ok
+    }
+
+    let (rest, list) = terminated(verify(segments, is_well_formed), not(one_of(":.")))(s)?;
+
+    let mut ipv6: Vec<u16> = Vec::with_capacity(8);
+    let mut compression_pos = None;
+    for segment in list {
+        match segment {
+            Segment::Hextet(v) => ipv6.push(v),
+            // Remember the output position where the zero groups belong.
+            Segment::Compressed => compression_pos = Some(ipv6.len()),
+            // `parse_ip_v4` only ever yields exactly four octets.
+            Segment::IpV4(octets) => ipv6.extend(
+                octets
+                    .chunks_exact(2)
+                    .map(|bytes| u16::from_be_bytes([bytes[0], bytes[1]])),
+            ),
+        }
+    }
+    if let Some(idx) = compression_pos {
+        while ipv6.len() < 8 {
+            ipv6.insert(idx, 0x0);
+        }
+    }
+
+    Ok((rest, ipv6))
+}
+
+/// Parses a dotted-quad IPv4 address (`a.b.c.d`) into its four octets.
+///
+/// Each octet is a decimal number from 0 to 255 written without leading
+/// zeros, following the `dec-octet` rule of RFC 3986.
+fn parse_ip_v4(s: &str) -> ParserResult<Vec<u8>> {
+    fn octet(s: &str) -> ParserResult<u8> {
+        map_parser(
+            verify(
+                take_while_m_n(1, 3, |c: char| c.is_ascii_digit()),
+                |digits: &str| digits.len() == 1 || !digits.starts_with('0'),
+            ),
+            // `U8` fails on overflow, which rejects values above 255.
+            all_consuming(U8),
+        )(s)
+    }
+    verify(separated_list1(tag("."), octet), |list: &[u8]| {
+        list.len() == 4
+    })(s)
+}
+
+#[cfg(test)]
+mod test {
+    use crate::iri::{parse_ip_v4, parse_ip_v6};
+
+    #[test]
+    fn parse_ip_v4_test() {
+        assert_eq!(
+            parse_ip_v4("192.168.0.1").unwrap(),
+            ("", [192, 168, 0, 1].to_vec())
+        );
+        assert_eq!(
+            parse_ip_v4("127.0.0.1").unwrap(),
+            ("", [127, 0, 0, 1].to_vec())
+        );
+        assert_eq!(parse_ip_v4("8.8.8.8").unwrap(), ("", [8, 8, 8, 8].to_vec()));
+        assert_eq!(
+            parse_ip_v4("255.255.255.255").unwrap(),
+            ("", [255, 255, 255, 255].to_vec())
+        );
+        assert!(parse_ip_v4("256.1.1.1").is_err());
+        assert!(parse_ip_v4("192.168.0").is_err());
+        assert!(parse_ip_v4("192.168..1").is_err());
+        assert!(parse_ip_v4("192.168.0.1.5").is_err());
+        assert!(parse_ip_v4("192.168.00.1").is_err());
+    }
+
+    #[test]
+    fn parse_ip_v6_test() {
+        assert_eq!(
+            parse_ip_v6("2001:0db8:85a3:0000:0000:8a2e:0370:7334").unwrap(),
+            (
+                "",
+                [0x2001, 0x0db8, 0x85a3, 0, 0, 0x8a2e, 0x370, 0x7334].into()
+            )
+        );
+        assert_eq!(
+            parse_ip_v6("2001:0db8:0000:0000:0000:0000:0000:0001").unwrap(),
+            ("", [0x2001, 0x0db8, 0, 0, 0, 0, 0, 1].into())
+        );
+
+        assert_eq!(
+            parse_ip_v6("2001:0db8:0000:0000:0000:ff00:0042:8329").unwrap(),
+            ("", [0x2001, 0x0db8, 0, 0, 0, 0xff00, 0x42, 0x8329].into())
+        );
+
+        assert_eq!(
+            parse_ip_v6("2001:db8:0:0:0:ff00:42:8329").unwrap(),
+            ("", [0x2001, 0x0db8, 0, 0, 0, 0xff00, 0x42, 0x8329].into())
+        );
+
+        assert!(parse_ip_v6("2001:db8::::ff00:42:8329").is_err());
+        assert_eq!(
+            parse_ip_v6("::ffff:192.0.2.128").unwrap(),
+            ("", [0, 0, 0, 0, 0, 0xffff, 0xc000, 0x280].into())
+        );
+        let test_cases = [
+            (
+                "2001:0db8:85a3:0000:0000:8a2e:0370:7334",
+                vec![
+                    0x2001, 0x0db8, 0x85a3, 0x0000, 0x0000, 0x8a2e, 0x0370, 0x7334,
+                ],
+            ),
+            (
+                "2001:db8:85a3::8a2e:370:7334",
+                vec![
+                    0x2001, 0xdb8, 0x85a3, 0x0000, 0x0000, 0x8a2e, 0x0370, 0x7334,
+                ],
+            ),
+            (
+                "2001:db8:85a3:0:0:8a2e:0370:7334",
+                vec![
+                    0x2001, 0xdb8, 0x85a3, 0x0000, 0x0000, 0x8a2e, 0x0370, 0x7334,
+                ],
+            ),
+            (
+                "2001:db8::370:7334",
+                vec![0x2001, 0xdb8, 0x0, 0x0, 0x0, 0x0, 0x370, 0x7334],
+            ),
+            (
+                "2001:0db8:0000:0000:0000:ff00:0042:8329",
+                vec![0x2001, 0x0db8, 0x0, 0x0, 0x0, 0xff00, 0x42, 0x8329],
+            ),
+            (
+                "fe80::1ff:fe23:4567:890a",
+                vec![0xfe80, 0x0, 0x0, 0x0, 0x1ff, 0xfe23, 0x4567, 0x890a],
+            ),
+            (
+                "0:0:0:0:0:0:0:0",
+                vec![0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0],
+            ),
+            (
+                "0:0:0:0:0:0:0:1",
+                vec![0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1],
+            ),
+            (
+                "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff",
+                vec![
+                    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+                ],
+            ),
+            ("::", vec![0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0]),
+            ("::1", vec![0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1]),
+            ("1::", vec![0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0]),
+            (
+                "::192.0.2.128",
+                vec![0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xc000, 0x280],
+            ),
+            (
+                "1:2:3:4:5:6:192.0.2.128",
+                vec![0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0xc000, 0x280],
+            ),
+        ];
+
+        for (addr, expected) in test_cases.into_iter() {
+            let result = parse_ip_v6(addr).unwrap();
+            assert_eq!(result, ("", expected),);
+        }
+
+        let malformed = [
+            "",
+            ":1:2:3:4:5:6:7:8",
+            "1:2:3",
+            "1:2:3:4:5:6:7:8:9",
+            "2001:db8:::ff00",
+            "1:2:3:4::5:6:7:8",
+            "1:2:3:4:5:6:7:192.0.2.128",
+            "::192.0.2.128:1",
+        ];
+        for addr in malformed {
+            assert!(parse_ip_v6(addr).is_err(), "{addr} should be rejected");
+        }
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index eaf960e..d201498 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,8 +1,8 @@
+pub mod iri;
 mod shared;
 mod string_parser;
 mod triple_common_parser;
 pub mod turtle;
-
 pub mod prelude {
     use nom::error::VerboseError;
     pub use nom::{
@@ -14,7 +14,7 @@ pub mod prelude {
         character::{
             complete::{
                 alphanumeric1, char, i64 as I64, line_ending, multispace0, multispace1, space0,
-                space1, u32 as U32,
+                space1, u16 as U16, u32 as U32, u8 as U8,
             },
             is_alphanumeric, is_space,
         },
diff --git a/src/tests/mod.rs b/src/tests/mod.rs
index 278996f..b18546b 100644
--- a/src/tests/mod.rs
+++ b/src/tests/mod.rs
@@ -80,13 +80,13 @@ fn cmp_input_file(
                     .replace(" ", ""),
             );
         }
-        assert_eq!(input.len(), output.len());
         if !diff.is_empty() {
             println!("========== Differences ==========");
             println!("{diff}");
             println!("========== Differences ==========");
         }
         assert_eq!(diff.len(), 0);
+        assert_eq!(input.len(), output.len());
     }
 }
 mod triple_common_parser_test_misc;
diff --git a/src/tests/turtle_doc_test.rs b/src/tests/turtle_doc_test.rs
index 5455633..ba334d8 100644
--- a/src/tests/turtle_doc_test.rs
+++ b/src/tests/turtle_doc_test.rs
@@ -32,6 +32,7 @@ const INPUT_DIR: &str = "examples/turtle_doc";
 #[test_case("0025", None , false ; "EQ: test date 20/09/2012")]
 #[test_case("0026", None , false ; "EQ: test date 2023-08-30T10:31:00.080Z")]
 #[test_case("0027", None , true ; "JSON: test simple json result with bnode")]
+// #[test_case("0028", None , false ; "The following Turtle document contains examples of all the different ways of writing IRIs in Turtle.")]
 #[serial]
 fn test_turtle_doc(test_name: &str, diff_file: Option<&str>, output_json: bool) {
     reset_fake_uuid_gen();