-
Notifications
You must be signed in to change notification settings - Fork 33
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add a regular expression engine (#222)
* Add a regular expression engine * Fix off by one error * Add + quantifier * Add more tests * Use assert_eq instead of assert in tests * Rewrite tests with an array * Fix bug in is_match_star * Use the same do while equivalent in is_match_plus * Add ? quantifier * Refactor engine code * Add backslash char * Group ifs in match * Add special escaped chars * Add doc * Add find command * Add Match#find * Show multiple matches in the same line * Dry Regex * Change matches color * Add greedy version of matching by default * Add MetaChar enum to fix matching escaped chars * Change function signatures * Remove macro_export * Add TODO * Find matching lines recursively * Handle special patterns
- Loading branch information
Showing
7 changed files
with
447 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
# MOROS Regular Expression Engine | ||
|
||
MOROS includes a simplified regular expression engine with the following syntax: | ||
|
||
- `\` escapes the following character to its literal meaning | ||
- `^` matches the starting position within the string | ||
- `$` matches the ending position within the string | ||
- `*` matches the preceding element zero or more times | ||
- `+` matches the preceding element one or more times | ||
- `?` matches the preceding element zero or one time | ||
- `.` matches any single character | ||
- `\w` matches any alphanumeric character | ||
- `\W` matches any non-alphanumeric character | ||
- `\d` matches any numeric character | ||
- `\D` matches any non-numeric character | ||
- `\s` matches any whitespace character | ||
- `\S` matches any non-whitespace character | ||
|
||
The engine is UTF-8 aware, so for example the unicode character `é` will be | ||
matched by `\w` even if it's not present in the ASCII table and has a size | ||
of two bytes. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,284 @@ | ||
use alloc::string::{String, ToString}; | ||
use alloc::vec::Vec; | ||
use core::convert::From; | ||
use core::ops::RangeBounds; | ||
|
||
// TODO: Remove this when tests are done
/// Switch for the tracing output produced by `debug!`; tracing is off while
/// this is `false`.
const DEBUG: bool = false;

/// Prints a formatted trace line when `DEBUG` is `true` and does nothing
/// otherwise.
///
/// Takes the same arguments as `format_args!` (a format string followed by
/// its values).
macro_rules! debug {
    ($($arg:tt)*) => {{
        if DEBUG {
            println!("{}", format_args!($($arg)*));
        }
    }};
}
|
||
// See "A Regular Expression Matcher" by Rob Pike and Brian Kernighan (2007) | ||
|
||
/// One element of a pattern: a literal character or a character class.
#[derive(Debug)]
enum MetaChar {
    /// `.`: every character.
    Any,
    /// `\d`: characters for which `char::is_numeric` is true.
    Numeric,
    /// `\s`: characters for which `char::is_whitespace` is true.
    Whitespace,
    /// `\w`: characters for which `char::is_alphanumeric` is true.
    Alphanumeric,
    /// `\D`: the complement of `Numeric`.
    NonNumeric,
    /// `\S`: the complement of `Whitespace`.
    NonWhitespace,
    /// `\W`: the complement of `Alphanumeric`.
    NonAlphanumeric,
    /// Exactly the wrapped character.
    Literal(char),
}

impl From<char> for MetaChar {
    /// Interprets an *unescaped* pattern character: `.` is the wildcard,
    /// anything else stands for itself.
    fn from(c: char) -> Self {
        if c == '.' {
            Self::Any
        } else {
            Self::Literal(c)
        }
    }
}

/// Extra constructors and queries for `MetaChar`.
trait MetaCharExt {
    /// Interprets the character that follows a backslash in a pattern.
    fn from_escaped(c: char) -> Self;
    /// Returns `true` when `c` belongs to this element.
    fn contains(&self, c: char) -> bool;
}

impl MetaCharExt for MetaChar {
    /// Maps `d`, `s`, `w`, `D`, `S` and `W` to their character classes; any
    /// other escaped character becomes a literal (so `\.` is a plain dot).
    fn from_escaped(c: char) -> Self {
        match c {
            'w' => Self::Alphanumeric,
            'W' => Self::NonAlphanumeric,
            'd' => Self::Numeric,
            'D' => Self::NonNumeric,
            's' => Self::Whitespace,
            'S' => Self::NonWhitespace,
            other => Self::Literal(other),
        }
    }

    /// Tests membership using the Unicode-aware predicates of `char`.
    fn contains(&self, c: char) -> bool {
        match *self {
            Self::Literal(expected) => expected == c,
            Self::Any => true,
            Self::Alphanumeric => c.is_alphanumeric(),
            Self::NonAlphanumeric => !c.is_alphanumeric(),
            Self::Numeric => c.is_numeric(),
            Self::NonNumeric => !c.is_numeric(),
            Self::Whitespace => c.is_whitespace(),
            Self::NonWhitespace => !c.is_whitespace(),
        }
    }
}
|
||
#[derive(Debug)] | ||
pub struct Regex(String); | ||
|
||
impl Regex { | ||
pub fn new(re: &str) -> Self { | ||
debug!("debug: Regex::new({:?})", re); | ||
Self(re.to_string()) | ||
} | ||
pub fn is_match(&self, text: &str) -> bool { | ||
self.find(text).is_some() | ||
} | ||
pub fn find(&self, text: &str) -> Option<(usize, usize)> { | ||
let vec_re: Vec<char> = self.0.chars().collect(); | ||
let vec_text: Vec<char> = text.chars().collect(); | ||
let mut start = 0; | ||
let mut end = 0; | ||
if is_match(&vec_re[..], &vec_text[..], &mut start, &mut end) { | ||
Some((start, end)) | ||
} else { | ||
None | ||
} | ||
} | ||
} | ||
|
||
fn is_match(re: &[char], text: &[char], start: &mut usize, end: &mut usize) -> bool { | ||
debug!("debug: is_match({:?}, {:?})", re, text); | ||
if re.len() == 0 { | ||
return true; | ||
} | ||
if re[0] == '^' { | ||
*end = 1; | ||
return is_match_here(&re[1..], text, end); | ||
} | ||
let mut i = 0; | ||
let n = text.len(); | ||
loop { | ||
*start = i; | ||
*end = i; | ||
if is_match_here(re, &text[i..], end) { | ||
return true; | ||
} | ||
if i == n { | ||
return false; | ||
} | ||
i += 1; | ||
} | ||
} | ||
|
||
fn is_match_here(re: &[char], text: &[char], end: &mut usize) -> bool { | ||
debug!("debug: is_match_here({:?}, {:?})", re, text); | ||
if re.len() == 0 { | ||
return true; | ||
} | ||
if re[0] == '$' { | ||
return text.len() == 0; | ||
} | ||
let (mc, i) = if re.len() > 1 && re[0] == '\\' { | ||
(MetaChar::from_escaped(re[1]), 1) | ||
} else { | ||
(MetaChar::from(re[0]), 0) | ||
}; | ||
if re.len() > i + 1 { | ||
let lazy = re.len() > i + 2 && re[i + 2] == '?'; | ||
let j = if lazy { i + 3 } else { i + 2 }; | ||
|
||
match re[i + 1] { | ||
'*' => return is_match_star(lazy, mc, &re[j..], text, end), | ||
'+' => return is_match_plus(lazy, mc, &re[j..], text, end), | ||
'?' => return is_match_ques(lazy, mc, &re[j..], text, end), | ||
_ => {} | ||
} | ||
} | ||
if text.len() != 0 && mc.contains(text[0]) { | ||
*end += 1; | ||
let j = i + 1; | ||
return is_match_here(&re[j..], &text[1..], end); | ||
} | ||
false | ||
} | ||
|
||
fn is_match_star(lazy: bool, mc: MetaChar, re: &[char], text: &[char], end: &mut usize) -> bool { | ||
debug!("debug: is_match_star({:?}, {:?}, {:?}", mc, re, text); | ||
is_match_char(lazy, mc, re, text, .., end) | ||
} | ||
|
||
fn is_match_plus(lazy: bool, mc: MetaChar, re: &[char], text: &[char], end: &mut usize) -> bool { | ||
debug!("debug: is_match_plus({:?}, {:?}, {:?}", mc, re, text); | ||
is_match_char(lazy, mc, re, text, 1.., end) | ||
} | ||
|
||
fn is_match_ques(lazy: bool, mc: MetaChar, re: &[char], text: &[char], end: &mut usize) -> bool { | ||
debug!("debug: is_match_ques({:?}, {:?}, {:?}", mc, re, text); | ||
is_match_char(lazy, mc, re, text, ..2, end) | ||
} | ||
|
||
fn is_match_char<T: RangeBounds<usize>>(lazy: bool, mc: MetaChar, re: &[char], text: &[char], range: T, end: &mut usize) -> bool { | ||
debug!("debug: is_match_char({:?}, {:?}, {:?}", mc, re, text); | ||
let mut i = 0; | ||
let n = text.len(); | ||
|
||
if !lazy { | ||
loop { | ||
if i == n || !(mc.contains(text[i])) { | ||
break; | ||
} | ||
i += 1; | ||
} | ||
} | ||
|
||
loop { | ||
if is_match_here(re, &text[i..], end) && range.contains(&i) { | ||
*end += i; | ||
return true; | ||
} | ||
if lazy { | ||
if i == n || !(mc.contains(text[i])) { | ||
return false; | ||
} | ||
i += 1; | ||
} else { | ||
if i == 0 { | ||
return false; | ||
} | ||
i -= 1; | ||
} | ||
} | ||
} | ||
|
||
#[test_case] | ||
fn test_regex() { | ||
let tests = [ | ||
("", "aaa", true), | ||
("", "", true), | ||
("aaa", "aaa", true), | ||
("aaa", "bbb", false), | ||
("a.a", "aaa", true), | ||
("a.a", "aba", true), | ||
("a.a", "abb", false), | ||
|
||
("a*", "aaa", true), | ||
("a*b", "aab", true), | ||
("a*b*", "aabb", true), | ||
("a*b*", "bb", true), | ||
("a.*", "abb", true), | ||
(".*", "aaa", true), | ||
("a.*", "a", true), | ||
|
||
("a.+", "ab", true), | ||
("a.+", "abb", true), | ||
("a.+", "a", false), | ||
("a.+b", "ab", false), | ||
("a.+b", "abb", true), | ||
(".+", "abb", true), | ||
(".+", "b", true), | ||
|
||
("a?b", "abb", true), | ||
("a?b", "bb", true), | ||
("a?b", "aabb", true), | ||
|
||
("^a.*a$", "aaa", true), | ||
("^#.*", "#aaa", true), | ||
("^#.*", "a#aaa", false), | ||
(".*;$", "aaa;", true), | ||
(".*;$", "aaa;a", false), | ||
("^.*$", "aaa", true), | ||
|
||
("a.b", "abb", true), | ||
("a.b", "a.b", true), | ||
("a\\.b", "abb", false), | ||
("a\\.b", "a.b", true), | ||
("a\\\\.b", "abb", false), | ||
("a\\\\.b", "a.b", false), | ||
("a\\\\.b", "a\\bb", true), | ||
("a\\\\.b", "a\\.b", true), | ||
("a\\\\\\.b", "a\\bb", false), | ||
("a\\\\\\.b", "a\\.b", true), | ||
("a\\\\\\.b", "a\\\\bb", false), | ||
("a\\\\\\.b", "a\\\\.b", false), | ||
("a\\\\\\\\.b", "a\\bb", false), | ||
("a\\\\\\\\.b", "a\\.b", false), | ||
("a\\\\\\\\.b", "a\\\\bb", true), | ||
("a\\\\\\\\.b", "a\\\\.b", true), | ||
|
||
("a\\wb", "aéb", true), | ||
("a\\wb", "awb", true), | ||
("a\\wb", "abb", true), | ||
("a\\wb", "a1b", true), | ||
("a\\wb", "a.b", false), | ||
("a\\Wb", "aWb", false), | ||
("a\\Wb", "abb", false), | ||
("a\\Wb", "a1b", false), | ||
("a\\Wb", "a.b", true), | ||
("a\\db", "abb", false), | ||
("a\\db", "a1b", true), | ||
("a\\Db", "abb", true), | ||
("a\\Db", "a1b", false), | ||
("a\\sb", "abb", false), | ||
("a\\sb", "a b", true), | ||
("a\\Sb", "abb", true), | ||
("a\\Sb", "a b", false), | ||
|
||
("a\\.*d", "a..d", true), | ||
("a\\.*d", "a.cd", false), | ||
("a\\w*d", "abcd", true), | ||
]; | ||
for (re, text, is_match) in tests { | ||
assert!(Regex::new(re).is_match(text) == is_match, "Regex::new(\"{}\").is_match(\"{}\") == {}", re, text, is_match); | ||
} | ||
|
||
assert_eq!(Regex::new(".*").find("abcd"), Some((0, 4))); | ||
assert_eq!(Regex::new("b.*c").find("aaabbbcccddd"), Some((3, 9))); | ||
assert_eq!(Regex::new("b.*?c").find("aaabbbcccddd"), Some((3, 7))); | ||
assert_eq!(Regex::new("a\\w*d").find("abcdabcd"), Some((0, 8))); | ||
assert_eq!(Regex::new("a\\w*?d").find("abcdabcd"), Some((0, 4))); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.