From 081391a555da104f1bc0524bedfdfa4b2b93cfac Mon Sep 17 00:00:00 2001
From: Nordine Bittich <contact@bittich.be>
Date: Mon, 25 Nov 2024 00:22:43 +0100
Subject: [PATCH] IRI proper implementation: ipv4 & ipv6

---
 examples/turtle_doc/input/0028.ttl  |  24 ++++
 examples/turtle_doc/output/0028.ttl |   7 +
 src/iri.rs                          | 190 ++++++++++++++++++++++++++++
 src/lib.rs                          |   4 +-
 src/tests/mod.rs                    |   2 +-
 src/tests/turtle_doc_test.rs        |   1 +
 6 files changed, 225 insertions(+), 3 deletions(-)
 create mode 100644 examples/turtle_doc/input/0028.ttl
 create mode 100644 examples/turtle_doc/output/0028.ttl
 create mode 100644 src/iri.rs
diff --git a/examples/turtle_doc/input/0028.ttl b/examples/turtle_doc/input/0028.ttl
new file mode 100644
index 0000000..f92bded
--- /dev/null
+++ b/examples/turtle_doc/input/0028.ttl
@@ -0,0 +1,24 @@
+# A triple with all absolute IRIs
+<http://one.example/subject1> <http://one.example/predicate1> <http://one.example/object1> .
+
+@base <http://one.example/> .
+<subject2> <predicate2> <object2> .     # relative IRIs, e.g. http://one.example/subject2
+
+BASE <http://one.example/>
+<subject2> <predicate2> <object2> .     # relative IRIs, e.g. http://one.example/subject2
+
+@prefix p: <http://two.example/> .
+p:subject3 p:predicate3 p:object3 .     # prefixed name, e.g. http://two.example/subject3
+
+PREFIX p: <http://two.example/>
+p:subject3 p:predicate3 p:object3 .     # prefixed name, e.g. http://two.example/subject3
+
+@prefix p: <path/> .                    # prefix p: now stands for http://one.example/path/
+p:subject4 p:predicate4 p:object4 .     # prefixed name, e.g. http://one.example/path/subject4
+
+@prefix : <http://another.example/> .    # empty prefix
+:subject5 :predicate5 :object5 .        # prefixed name, e.g. http://another.example/subject5
+
+:subject6 a :subject7 .                 # same as :subject6 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> :subject7 .
+
+<http://伝言.example/?user=أكرم&amp;channel=R%26D> a :subject8 . # a multi-script subject IRI .
diff --git a/examples/turtle_doc/output/0028.ttl b/examples/turtle_doc/output/0028.ttl
new file mode 100644
index 0000000..a100141
--- /dev/null
+++ b/examples/turtle_doc/output/0028.ttl
@@ -0,0 +1,7 @@
+      <http://two.example/subject3> <http://two.example/predicate3> <http://two.example/object3> .
+<http://another.example/subject5> <http://another.example/predicate5> <http://another.example/object5> .
+<http://one.example/path/subject4> <http://one.example/path/predicate4> <http://one.example/path/object4> .
+<http://one.example/subject2> <http://one.example/predicate2> <http://one.example/object2> .
+<http://another.example/subject6> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://another.example/subject7> .
+<http://one.example/subject1> <http://one.example/predicate1> <http://one.example/object1> .
+<http://伝言.example/?user=أكرم&amp;channel=R%26D> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://another.example/subject8> .
diff --git a/src/iri.rs b/src/iri.rs
new file mode 100644
index 0000000..aaedbfb
--- /dev/null
+++ b/src/iri.rs
@@ -0,0 +1,190 @@
+#![allow(unused)]
+
+use std::{collections::VecDeque, ops::RangeBounds};
+
+use chrono::ParseResult;
+use nom::{
+    bytes::complete::take_while_m_n,
+    character::complete::one_of,
+    combinator::{success, verify},
+    error::{ParseError, VerboseError},
+    multi::{many1, many_m_n},
+};
+
+use crate::prelude::*;
+
+pub enum Segment { // one lexical piece of the textual IPv6 form handled by `parse_ip_v6`
+    Hextet(u16),   // a group of 1-4 hexadecimal digits, already converted to its value
+    Compressed,    // the `::` marker
+    IpV4(Vec<u8>), // dotted-quad octets as returned by `parse_ip_v4`
+}
+/// Parses an IPv6 literal into its eight 16-bit groups and returns the unparsed remainder.
+/// `::` may appear once and must stand for at least one group; a dotted-quad IPv4 tail is
+/// only accepted as the final segment. Fails unless exactly eight groups result.
+fn parse_ip_v6(s: &str) -> ParserResult<Vec<u16>> {
+    // A dotted quad (tried first, since it also starts with digits) or 1-4 hex digits.
+    fn group(s: &str) -> ParserResult<Segment> {
+        let digits = take_while_m_n(1, 4, |c: char| c.is_ascii_hexdigit());
+        let hextet = map_res(digits, |h: &str| u16::from_str_radix(h, 16));
+        alt((map(parse_ip_v4, Segment::IpV4), map(hextet, Segment::Hextet)))(s)
+    }
+    // Collects up to 8 segments. A group needs a leading `:` unless it opens the literal
+    // or directly follows `::`, so neither "12345678" nor "1:::2" reads as two groups.
+    fn segments(mut s: &str) -> ParserResult<Vec<Segment>> {
+        let (mut list, mut bare) = (Vec::with_capacity(8), true);
+        while list.len() < 8 {
+            let gap = map(tag("::"), |_| Segment::Compressed);
+            let sep = nom::combinator::cond(!bare, tag(":"));
+            if let Ok((rest, seg)) = alt((gap, preceded(sep, group)))(s) {
+                bare = matches!(seg, Segment::Compressed);
+                list.push(seg);
+                s = rest;
+            } else {
+                break;
+            }
+        }
+        Ok((s, list))
+    }
+    // Expands the segments to exactly eight groups, or says why that is impossible.
+    fn assemble(list: Vec<Segment>) -> Result<Vec<u16>, &'static str> {
+        let last = list.len().saturating_sub(1);
+        let (mut groups, mut gap_at) = (Vec::with_capacity(8), None);
+        for (idx, seg) in list.into_iter().enumerate() {
+            match seg {
+                Segment::Hextet(h) => groups.push(h),
+                Segment::Compressed if gap_at.is_none() => gap_at = Some(groups.len()),
+                Segment::IpV4(o) if idx == last && o.len() == 4 => {
+                    groups.extend(o.chunks(2).map(|p| u16::from_be_bytes([p[0], p[1]])));
+                }
+                _ => return Err("repeated `::` or misplaced IPv4 tail"),
+            }
+        }
+        match (gap_at, groups.len()) {
+            // Dropping the `Splice` inserts the zero groups that `::` stood for.
+            (Some(at), n) if n < 8 => drop(groups.splice(at..at, vec![0; 8 - n])),
+            (None, 8) => {}
+            _ => return Err("an IPv6 literal must describe exactly eight groups"),
+        }
+        Ok(groups)
+    }
+    map_res(segments, assemble)(s)
+}
+/// Parses a dotted-quad IPv4 address such as `192.168.0.1` into its four octets.
+/// Each octet is 1-3 ASCII digits, at most 255, without a leading zero (RFC 3986 `dec-octet`).
+fn parse_ip_v4(s: &str) -> ParserResult<Vec<u8>> {
+    fn octet(s: &str) -> ParserResult<u8> {
+        let digits = verify(take_while_m_n(1, 3, |c: char| c.is_ascii_digit()), |d: &str| {
+            d.len() == 1 || !d.starts_with('0')
+        });
+        // `parse` rejects values above 255, so no separate range check is needed.
+        map_res(digits, |d: &str| d.parse::<u8>())(s)
+    }
+    verify(separated_list1(tag("."), octet), |list: &[u8]| list.len() == 4)(s)
+}
+
+#[cfg(test)]
+mod test {
+    use crate::iri::{parse_ip_v4, parse_ip_v6};
+
+    /// Well-formed dotted quads yield their four octets.
+    #[test]
+    fn parse_ip_v4_test() {
+        let accepted: [(&str, [u8; 4]); 5] = [
+            ("192.168.0.1", [192, 168, 0, 1]),
+            ("127.0.0.1", [127, 0, 0, 1]),
+            ("8.8.8.8", [8, 8, 8, 8]),
+            ("0.0.0.0", [0, 0, 0, 0]),
+            ("255.255.255.255", [255, 255, 255, 255]),
+        ];
+        for (addr, octets) in accepted {
+            // Name the address in every failure so a broken case is easy to spot.
+            let parsed = parse_ip_v4(addr).unwrap_or_else(|e| panic!("{addr}: {e:?}"));
+            assert_eq!(parsed, ("", octets.to_vec()), "octets of {addr}");
+        }
+    }
+
+    /// Parsing stops after the fourth octet and hands back what follows.
+    #[test]
+    fn parse_ip_v4_leaves_trailing_input() {
+        assert_eq!(parse_ip_v4("10.0.0.1:8080").unwrap(), (":8080", vec![10, 0, 0, 1]));
+    }
+
+    /// Malformed dotted quads are rejected.
+    #[test]
+    fn parse_ip_v4_rejects_malformed_input() {
+        let rejected = [
+            "256.1.1.1",  // octet above 255
+            "192.168.0",  // only three octets
+            "192.168..1", // empty octet
+            "1.2.3.4.5",  // five octets
+            "a.b.c.d",    // not decimal
+            "",           // nothing to parse
+        ];
+        for addr in rejected {
+            assert!(parse_ip_v4(addr).is_err(), "{addr:?} must be rejected");
+        }
+    }
+
+    /// Full, zero-compressed and IPv4-suffixed literals expand to eight 16-bit groups.
+    #[test]
+    fn parse_ip_v6_test() {
+        let accepted: [(&str, [u16; 8]); 16] = [
+            // Full form, with and without leading zeros in each group.
+            (
+                "2001:0db8:85a3:0000:0000:8a2e:0370:7334",
+                [0x2001, 0x0db8, 0x85a3, 0, 0, 0x8a2e, 0x0370, 0x7334],
+            ),
+            (
+                "2001:db8:85a3:0:0:8a2e:0370:7334",
+                [0x2001, 0x0db8, 0x85a3, 0, 0, 0x8a2e, 0x0370, 0x7334],
+            ),
+            (
+                "2001:0db8:0000:0000:0000:ff00:0042:8329",
+                [0x2001, 0x0db8, 0, 0, 0, 0xff00, 0x42, 0x8329],
+            ),
+            ("2001:db8:0:0:0:ff00:42:8329", [0x2001, 0x0db8, 0, 0, 0, 0xff00, 0x42, 0x8329]),
+            ("2001:0db8:0000:0000:0000:0000:0000:0001", [0x2001, 0x0db8, 0, 0, 0, 0, 0, 1]),
+            // Extremes of the address space.
+            ("0:0:0:0:0:0:0:0", [0; 8]),
+            ("0:0:0:0:0:0:0:1", [0, 0, 0, 0, 0, 0, 0, 1]),
+            ("ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff", [0xffff; 8]),
+            // `::` in the middle stands for as many zero groups as are missing.
+            ("2001:db8:85a3::8a2e:370:7334", [0x2001, 0xdb8, 0x85a3, 0, 0, 0x8a2e, 0x370, 0x7334]),
+            ("2001:db8::370:7334", [0x2001, 0xdb8, 0, 0, 0, 0, 0x370, 0x7334]),
+            ("fe80::1ff:fe23:4567:890a", [0xfe80, 0, 0, 0, 0x1ff, 0xfe23, 0x4567, 0x890a]),
+            // `::` at either end, or on its own.
+            ("::", [0; 8]),
+            ("::1", [0, 0, 0, 0, 0, 0, 0, 1]),
+            ("1::", [1, 0, 0, 0, 0, 0, 0, 0]),
+            // Hexadecimal digits are case-insensitive.
+            ("2001:DB8::1", [0x2001, 0xdb8, 0, 0, 0, 0, 0, 1]),
+            // IPv4-mapped address: the dotted quad fills the last two groups.
+            ("::ffff:192.0.2.128", [0, 0, 0, 0, 0, 0xffff, 0xc000, 0x280]),
+        ];
+        for (addr, groups) in accepted {
+            let parsed = parse_ip_v6(addr).unwrap_or_else(|e| panic!("{addr}: {e:?}"));
+            assert_eq!(parsed, ("", groups.to_vec()), "groups of {addr}");
+        }
+    }
+
+    /// Parsing stops at the first character that cannot belong to the literal.
+    #[test]
+    fn parse_ip_v6_leaves_trailing_input() {
+        let (rest, groups) = parse_ip_v6("::1]/path").unwrap();
+        assert_eq!((rest, groups), ("]/path", vec![0, 0, 0, 0, 0, 0, 0, 1]));
+    }
+
+    /// Literals with a repeated `::`, foreign characters or no content are rejected.
+    #[test]
+    fn parse_ip_v6_rejects_malformed_input() {
+        let rejected = [
+            "2001:db8::::ff00:42:8329", // `::` twice in a row
+            "::1::",                    // `::` twice, apart
+            "g::1",                     // not hexadecimal
+            "",                         // nothing to parse
+        ];
+        for addr in rejected {
+            assert!(parse_ip_v6(addr).is_err(), "{addr:?} must be rejected");
+        }
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index eaf960e..d201498 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,8 +1,8 @@
+pub mod iri;
 mod shared;
 mod string_parser;
 mod triple_common_parser;
 pub mod turtle;
-
 pub mod prelude {
     use nom::error::VerboseError;
     pub use nom::{
@@ -14,7 +14,7 @@ pub mod prelude {
         character::{
             complete::{
                 alphanumeric1, char, i64 as I64, line_ending, multispace0, multispace1, space0,
-                space1, u32 as U32,
+                space1, u16 as U16, u32 as U32, u8 as U8,
             },
             is_alphanumeric, is_space,
         },
diff --git a/src/tests/mod.rs b/src/tests/mod.rs
index 278996f..b18546b 100644
--- a/src/tests/mod.rs
+++ b/src/tests/mod.rs
@@ -80,13 +80,13 @@ fn cmp_input_file(
                     .replace(" ", "<SPACE>"),
             );
         }
-        assert_eq!(input.len(), output.len());
         if !diff.is_empty() {
             println!("========== Differences ==========");
             println!("{diff}");
             println!("========== Differences ==========");
         }
         assert_eq!(diff.len(), 0);
+        assert_eq!(input.len(), output.len());
     }
 }
 mod triple_common_parser_test_misc;
diff --git a/src/tests/turtle_doc_test.rs b/src/tests/turtle_doc_test.rs
index 5455633..ba334d8 100644
--- a/src/tests/turtle_doc_test.rs
+++ b/src/tests/turtle_doc_test.rs
@@ -32,6 +32,7 @@ const INPUT_DIR: &str = "examples/turtle_doc";
 #[test_case("0025", None          , false ; "EQ: test date 20/09/2012")]
 #[test_case("0026", None          , false ; "EQ: test date 2023-08-30T10:31:00.080Z")]
 #[test_case("0027", None          , true  ; "JSON: test simple json result with bnode")]
+// #[test_case("0028", None          , false  ; "The following Turtle document contains examples of all the different ways of writing IRIs in Turtle.")]
 #[serial]
 fn test_turtle_doc(test_name: &str, diff_file: Option<&str>, output_json: bool) {
     reset_fake_uuid_gen();