Add a regular expression engine (#222)

* Add a regular expression engine * Fix off by one error * Add + quantifier * Add more tests * Use assert_eq instead of assert in tests * Rewrite tests with an array * Fix bug in is_match_star * Use the same do while equivalent in is_match_plus * Add ? quantifier * Refactor engine code * Add backslash char * Group ifs in match * Add special escaped chars * Add doc * Add find command * Add Match#find * Show multiple matches in the same line * Dry Regex * Change matches color * Add greedy version of matching by default * Add MetaChar enum to fix matching escaped chars * Change function signatures * Remove macro_export * Add TODO * Find matching lines recursively * Handle special patterns
vinc · Aug 1, 2021 · de48c87 · de48c87
1 parent 72f9baa
commit de48c87
Show file tree

Hide file tree

Showing 7 changed files with 447 additions and 1 deletion.
diff --git a/doc/regex.md b/doc/regex.md
@@ -0,0 +1,21 @@
+# MOROS Regular Expression Engine
+
+MOROS includes a simplified regular expression engine with the following syntax:
+
+- `\` escapes the following character to its literal meaning
+- `^` matches the starting position within the string
+- `$` matches the ending position within the string
+- `*` matches the preceding element zero or more times
+- `+` matches the preceding element one or more times
+- `?` matches the preceding element zero or one time
+- `.` matches any single character
+- `\w` matches any alphanumeric character
+- `\W` matches any non-alphanumeric character
+- `\d` matches any numeric character
+- `\D` matches any non-numeric character
+- `\s` matches any whitespace character
+- `\S` matches any non-whitespace character
+
+The engine is UTF-8 aware, so for example the unicode character `é` will be
+matched by `\w` even if it's not present in the ASCII table and has a size
+of two bytes.
diff --git a/src/api/mod.rs b/src/api/mod.rs
@@ -20,5 +20,7 @@ pub mod console;
 pub mod font;
 pub mod fs;
 pub mod prompt;
+pub mod regex;
 pub mod syscall;
 pub mod vga;
+// TODO: add mod wildcard
diff --git a/src/api/regex.rs b/src/api/regex.rs
@@ -0,0 +1,284 @@
+use alloc::string::{String, ToString};
+use alloc::vec::Vec;
+use core::convert::From;
+use core::ops::RangeBounds;
+
+// TODO: Remove this when tests are done (compile-time switch for the `debug!` tracing macro below)
+const DEBUG: bool = false; // while false, `debug!` skips its `println!` and prints nothing
+macro_rules! debug { // takes `format!`-style arguments and prints them as one line when DEBUG is true
+    ($($arg:tt)*) => ({
+        if DEBUG {
+            println!("{}", format_args!($($arg)*));
+        }
+    });
+}
+
+// See "A Regular Expression Matcher" by Rob Pike and Brian Kernighan (2007)
+
+#[derive(Debug)]
+enum MetaChar { // one pattern element: a character class or a single literal character
+    Any,             // produced by an unescaped `.`; contains every character
+    Numeric,         // produced by `\d`; contains chars for which `char::is_numeric` is true
+    Whitespace,      // produced by `\s`; contains chars for which `char::is_whitespace` is true
+    Alphanumeric,    // produced by `\w`; contains chars for which `char::is_alphanumeric` is true
+    NonNumeric,      // produced by `\D`; negation of `Numeric`
+    NonWhitespace,   // produced by `\S`; negation of `Whitespace`
+    NonAlphanumeric, // produced by `\W`; negation of `Alphanumeric`
+    Literal(char),   // contains exactly the wrapped character
+}
+
+/// Converts an unescaped pattern character into a pattern element.
+///
+/// Only `.` is special here (it becomes `MetaChar::Any`); every other
+/// character is taken literally.
+impl From<char> for MetaChar {
+    fn from(c: char) -> Self {
+        if c == '.' {
+            MetaChar::Any
+        } else {
+            MetaChar::Literal(c)
+        }
+    }
+}
+
+trait MetaCharExt { // extra operations on a pattern element, implemented below for `MetaChar`
+    fn from_escaped(c: char) -> Self; // builds the element for the character that follows a `\`
+    fn contains(&self, c: char) -> bool; // true when `c` is matched by this element
+}
+
+/// Escape-sequence parsing and membership testing for pattern elements.
+impl MetaCharExt for MetaChar {
+    /// Builds the element denoted by `\c`.
+    ///
+    /// `d`, `s` and `w` select the numeric, whitespace and alphanumeric
+    /// classes, their upper-case forms select the negated classes, and any
+    /// other character is taken as a literal.
+    fn from_escaped(c: char) -> Self {
+        match c {
+            'd' => Self::Numeric,
+            'D' => Self::NonNumeric,
+            's' => Self::Whitespace,
+            'S' => Self::NonWhitespace,
+            'w' => Self::Alphanumeric,
+            'W' => Self::NonAlphanumeric,
+            other => Self::Literal(other),
+        }
+    }
+
+    /// Tells whether the character `c` belongs to this element.
+    fn contains(&self, c: char) -> bool {
+        match *self {
+            MetaChar::Any => true,
+            MetaChar::Literal(expected) => expected == c,
+            MetaChar::Numeric => c.is_numeric(),
+            MetaChar::NonNumeric => !c.is_numeric(),
+            MetaChar::Whitespace => c.is_whitespace(),
+            MetaChar::NonWhitespace => !c.is_whitespace(),
+            MetaChar::Alphanumeric => c.is_alphanumeric(),
+            MetaChar::NonAlphanumeric => !c.is_alphanumeric(),
+        }
+    }
+}
+
+/// A regular expression, kept as its pattern string and interpreted on
+/// every call (nothing is compiled or validated up front).
+#[derive(Debug)]
+pub struct Regex(String);
+
+impl Regex {
+    /// Creates a regex from the pattern `re`, storing a copy of it.
+    pub fn new(re: &str) -> Self {
+        debug!("debug: Regex::new({:?})", re);
+        Regex(String::from(re))
+    }
+
+    /// Returns `true` when the pattern matches somewhere in `text`.
+    pub fn is_match(&self, text: &str) -> bool {
+        matches!(self.find(text), Some(_))
+    }
+
+    /// Looks for a match in `text`.
+    ///
+    /// Returns `Some((start, end))` on success and `None` otherwise. Both
+    /// the pattern and the text are processed as sequences of `char`s, so
+    /// the positions are `char` indices rather than byte offsets.
+    pub fn find(&self, text: &str) -> Option<(usize, usize)> {
+        let pattern: Vec<char> = self.0.chars().collect();
+        let haystack: Vec<char> = text.chars().collect();
+        let (mut start, mut end) = (0, 0);
+        let found = is_match(&pattern, &haystack, &mut start, &mut end);
+        found.then(|| (start, end))
+    }
+}
+
+/// Searches `text` for the leftmost position where the pattern `re` matches.
+///
+/// On success returns `true` and stores the bounds of the match in `start`
+/// and `end`, as indices into `text` (`end` is one past the last matched
+/// character). An empty pattern matches immediately and leaves `start` and
+/// `end` untouched. On failure the values left in `start` and `end` are
+/// meaningless.
+fn is_match(re: &[char], text: &[char], start: &mut usize, end: &mut usize) -> bool {
+    debug!("debug: is_match({:?}, {:?})", re, text);
+    if re.is_empty() {
+        return true;
+    }
+    if re[0] == '^' {
+        // An anchored pattern can only match at the very beginning of the
+        // text. `end` is advanced once per consumed character, so it has to
+        // start at 0 like `start`; starting at 1 reported an end position
+        // one past the real end of the match.
+        *start = 0;
+        *end = 0;
+        return is_match_here(&re[1..], text, end);
+    }
+    // Try every start offset, `text.len()` included, so that patterns able
+    // to match the empty string still succeed at the end of the text.
+    for i in 0..=text.len() {
+        *start = i;
+        *end = i;
+        if is_match_here(re, &text[i..], end) {
+            return true;
+        }
+    }
+    false
+}
+
+/// Tells whether the pattern `re` matches at the very beginning of `text`.
+///
+/// `end` is incremented once for every character of `text` consumed by the
+/// match. When the function returns `false`, `end` may have been advanced
+/// by a partial attempt and must not be relied upon.
+fn is_match_here(re: &[char], text: &[char], end: &mut usize) -> bool {
+    debug!("debug: is_match_here({:?}, {:?})", re, text);
+    if re.is_empty() {
+        return true;
+    }
+    // `$` anchors the end of the text only when it is the last character of
+    // the pattern (as in Pike's matcher). Accepting it anywhere made the
+    // remainder of the pattern silently ignored, e.g. `a$b` matched "a".
+    if re.len() == 1 && re[0] == '$' {
+        return text.is_empty();
+    }
+    // `i` is the index of the last pattern character used by the current
+    // element: 1 for an escape sequence such as `\d`, 0 otherwise.
+    let (mc, i) = if re.len() > 1 && re[0] == '\\' {
+        (MetaChar::from_escaped(re[1]), 1)
+    } else {
+        (MetaChar::from(re[0]), 0)
+    };
+    if re.len() > i + 1 {
+        // A `?` right after a quantifier selects its lazy variant; `j` is
+        // where the rest of the pattern starts once the quantifier (and the
+        // optional lazy marker) has been skipped.
+        let lazy = re.len() > i + 2 && re[i + 2] == '?';
+        let j = if lazy { i + 3 } else { i + 2 };
+
+        match re[i + 1] {
+            '*' => return is_match_star(lazy, mc, &re[j..], text, end),
+            '+' => return is_match_plus(lazy, mc, &re[j..], text, end),
+            '?' => return is_match_ques(lazy, mc, &re[j..], text, end),
+            _ => {}
+        }
+    }
+    // No quantifier: the element must match exactly one character.
+    if !text.is_empty() && mc.contains(text[0]) {
+        *end += 1;
+        return is_match_here(&re[i + 1..], &text[1..], end);
+    }
+    false
+}
+
+/// Matches the `*` quantifier: any number of repetitions of `mc`, none
+/// included, followed by the rest of the pattern `re`.
+fn is_match_star(lazy: bool, mc: MetaChar, re: &[char], text: &[char], end: &mut usize) -> bool {
+    debug!("debug: is_match_star({:?}, {:?}, {:?}", mc, re, text);
+    let zero_or_more = 0usize..;
+    is_match_char(lazy, mc, re, text, zero_or_more, end)
+}
+
+/// Matches the `+` quantifier: at least one repetition of `mc`, followed by
+/// the rest of the pattern `re`.
+fn is_match_plus(lazy: bool, mc: MetaChar, re: &[char], text: &[char], end: &mut usize) -> bool {
+    debug!("debug: is_match_plus({:?}, {:?}, {:?}", mc, re, text);
+    let one_or_more = 1usize..;
+    is_match_char(lazy, mc, re, text, one_or_more, end)
+}
+
+/// Matches the `?` quantifier: zero or one occurrence of `mc`, followed by
+/// the rest of the pattern `re`.
+fn is_match_ques(lazy: bool, mc: MetaChar, re: &[char], text: &[char], end: &mut usize) -> bool {
+    debug!("debug: is_match_ques({:?}, {:?}, {:?}", mc, re, text);
+    let zero_or_one = 0usize..=1;
+    is_match_char(lazy, mc, re, text, zero_or_one, end)
+}
+
+/// Matches a quantified element followed by the rest of the pattern.
+///
+/// `mc` is the repeated element, `range` the accepted numbers of
+/// repetitions, `re` the pattern after the quantifier and `text` the input
+/// at the current position. A greedy match (`lazy == false`) tries the
+/// longest run of `mc` first and shrinks it one character at a time; a lazy
+/// match tries the empty run first and grows it.
+///
+/// On success returns `true` with `end` advanced by the length of the run
+/// plus the characters consumed by the rest of the pattern. On failure the
+/// value left in `end` is meaningless.
+fn is_match_char<T: RangeBounds<usize>>(lazy: bool, mc: MetaChar, re: &[char], text: &[char], range: T, end: &mut usize) -> bool {
+    debug!("debug: is_match_char({:?}, {:?}, {:?}", mc, re, text);
+    let n = text.len();
+    // Position reached before this element; every attempt restarts from it.
+    let base = *end;
+
+    // Greedy: start from the longest run of `mc`. Lazy: start from no run.
+    let mut i = if lazy {
+        0
+    } else {
+        text.iter().take_while(|&&c| mc.contains(c)).count()
+    };
+
+    loop {
+        // Only try run lengths allowed by the quantifier.
+        if range.contains(&i) {
+            // A failed attempt may have advanced `end` part-way through the
+            // rest of the pattern; without this reset those stale increments
+            // were added to the end position of a later successful attempt.
+            *end = base;
+            if is_match_here(re, &text[i..], end) {
+                *end += i;
+                return true;
+            }
+        }
+        if lazy {
+            // Grow the run while the next character still belongs to `mc`.
+            if i == n || !mc.contains(text[i]) {
+                return false;
+            }
+            i += 1;
+        } else {
+            // Give back one character; fail once the empty run was tried.
+            if i == 0 {
+                return false;
+            }
+            i -= 1;
+        }
+    }
+}
+
+#[test_case]
+fn test_regex() { // table-driven checks of `Regex::is_match`, then position checks of `Regex::find`
+    let tests = [ // (pattern, text, expected result of is_match)
+        ("",            "aaa",     true),
+        ("",            "",        true),
+        ("aaa",         "aaa",     true),
+        ("aaa",         "bbb",     false),
+        ("a.a",         "aaa",     true),
+        ("a.a",         "aba",     true),
+        ("a.a",         "abb",     false),
+        // `*` quantifier
+        ("a*",          "aaa",     true),
+        ("a*b",         "aab",     true),
+        ("a*b*",        "aabb",    true),
+        ("a*b*",        "bb",      true),
+        ("a.*",         "abb",     true),
+        (".*",          "aaa",     true),
+        ("a.*",         "a",       true),
+        // `+` quantifier
+        ("a.+",         "ab",      true),
+        ("a.+",         "abb",     true),
+        ("a.+",         "a",       false),
+        ("a.+b",        "ab",      false),
+        ("a.+b",        "abb",     true),
+        (".+",          "abb",     true),
+        (".+",          "b",       true),
+        // `?` quantifier
+        ("a?b",         "abb",     true),
+        ("a?b",         "bb",      true),
+        ("a?b",         "aabb",    true),
+        // `^` and `$` anchors
+        ("^a.*a$",      "aaa",     true),
+        ("^#.*",        "#aaa",    true),
+        ("^#.*",        "a#aaa",   false),
+        (".*;$",        "aaa;",    true),
+        (".*;$",        "aaa;a",   false),
+        ("^.*$",        "aaa",     true),
+        // backslash escaping of `.` and of the backslash itself
+        ("a.b",         "abb",     true),
+        ("a.b",         "a.b",     true),
+        ("a\\.b",       "abb",     false),
+        ("a\\.b",       "a.b",     true),
+        ("a\\\\.b",     "abb",     false),
+        ("a\\\\.b",     "a.b",     false),
+        ("a\\\\.b",     "a\\bb",   true),
+        ("a\\\\.b",     "a\\.b",   true),
+        ("a\\\\\\.b",   "a\\bb",   false),
+        ("a\\\\\\.b",   "a\\.b",   true),
+        ("a\\\\\\.b",   "a\\\\bb", false),
+        ("a\\\\\\.b",   "a\\\\.b", false),
+        ("a\\\\\\\\.b", "a\\bb",   false),
+        ("a\\\\\\\\.b", "a\\.b",   false),
+        ("a\\\\\\\\.b", "a\\\\bb", true),
+        ("a\\\\\\\\.b", "a\\\\.b", true),
+        // escaped character classes: \w \W \d \D \s \S
+        ("a\\wb",       "aéb",     true),
+        ("a\\wb",       "awb",     true),
+        ("a\\wb",       "abb",     true),
+        ("a\\wb",       "a1b",     true),
+        ("a\\wb",       "a.b",     false),
+        ("a\\Wb",       "aWb",     false),
+        ("a\\Wb",       "abb",     false),
+        ("a\\Wb",       "a1b",     false),
+        ("a\\Wb",       "a.b",     true),
+        ("a\\db",       "abb",     false),
+        ("a\\db",       "a1b",     true),
+        ("a\\Db",       "abb",     true),
+        ("a\\Db",       "a1b",     false),
+        ("a\\sb",       "abb",     false),
+        ("a\\sb",       "a b",     true),
+        ("a\\Sb",       "abb",     true),
+        ("a\\Sb",       "a b",     false),
+        // quantifiers applied to escaped elements
+        ("a\\.*d",      "a..d",    true),
+        ("a\\.*d",      "a.cd",    false),
+        ("a\\w*d",      "abcd",    true),
+    ];
+    for (re, text, is_match) in tests {
+        assert!(Regex::new(re).is_match(text) == is_match, "Regex::new(\"{}\").is_match(\"{}\") == {}", re, text, is_match);
+    }
+    // `find` returns (start, end) positions; `*?` is the lazy form of `*`
+    assert_eq!(Regex::new(".*").find("abcd"), Some((0, 4)));
+    assert_eq!(Regex::new("b.*c").find("aaabbbcccddd"), Some((3, 9)));
+    assert_eq!(Regex::new("b.*?c").find("aaabbbcccddd"), Some((3, 7)));
+    assert_eq!(Regex::new("a\\w*d").find("abcdabcd"), Some((0, 8)));
+    assert_eq!(Regex::new("a\\w*?d").find("abcdabcd"), Some((0, 4)));
+}
diff --git a/src/sys/fs.rs b/src/sys/fs.rs
@@ -91,6 +91,10 @@ impl File {
         None
     }
 
+    pub fn name(&self) -> String { // returns a clone of this file's `name` field
+        self.name.clone()
+    }
+
     pub fn size(&self) -> usize { // returns the `size` field converted to `usize` (presumably a byte count — confirm)
         self.size as usize
     }