Skip to content

Commit

Permalink
Add a regular expression engine (#222)
Browse files Browse the repository at this point in the history
* Add a regular expression engine

* Fix off by one error

* Add + quantifier

* Add more tests

* Use assert_eq instead of assert in tests

* Rewrite tests with an array

* Fix bug in is_match_star

* Use the same do while equivalent in is_match_plus

* Add ? quantifier

* Refactor engine code

* Add backslash char

* Group ifs in match

* Add special escaped chars

* Add doc

* Add find command

* Add Match#find

* Show multiple matches in the same line

* Dry Regex

* Change matches color

* Add greedy version of matching by default

* Add MetaChar enum to fix matching escaped chars

* Change function signatures

* Remove macro_export

* Add TODO

* Find matching lines recursively

* Handle special patterns
  • Loading branch information
vinc authored Aug 1, 2021
1 parent 72f9baa commit de48c87
Show file tree
Hide file tree
Showing 7 changed files with 447 additions and 1 deletion.
21 changes: 21 additions & 0 deletions doc/regex.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# MOROS Regular Expression Engine

MOROS includes a simplified regular expression engine with the following syntax:

- `\` escapes the following character to its literal meaning
- `^` matches the starting position within the string
- `$` matches the ending position within the string
- `*` matches the preceding element zero or more times
- `+` matches the preceding element one or more times
- `?` matches the preceding element zero or one time
- `.` matches any single character
- `\w` matches any alphanumeric character
- `\W` matches any non-alphanumeric character
- `\d` matches any numeric character
- `\D` matches any non-numeric character
- `\s` matches any whitespace character
- `\S` matches any non-whitespace character

The engine is UTF-8 aware, so for example the unicode character `é` will be
matched by `\w` even if it's not present in the ASCII table and has a size
of two bytes.
2 changes: 2 additions & 0 deletions src/api/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,7 @@ pub mod console;
pub mod font;
pub mod fs;
pub mod prompt;
pub mod regex;
pub mod syscall;
pub mod vga;
// TODO: add mod wildcard
284 changes: 284 additions & 0 deletions src/api/regex.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,284 @@
use alloc::string::{String, ToString};
use alloc::vec::Vec;
use core::convert::From;
use core::ops::RangeBounds;

// TODO: Remove this when tests are done
/// Compile-time switch for the trace output emitted through `debug!`.
const DEBUG: bool = false;

/// Prints a formatted trace line when `DEBUG` is `true`, and does nothing
/// otherwise. Takes the same arguments as `format_args!`.
macro_rules! debug {
    ($($token:tt)*) => {{
        if DEBUG {
            println!("{}", format_args!($($token)*));
        }
    }};
}

// See "A Regular Expression Matcher" by Rob Pike and Brian Kernighan (2007)

/// A single pattern atom: either a class of characters or one literal.
///
/// Each class is listed next to its negation.
#[derive(Debug)]
enum MetaChar {
    /// Accepts every character.
    Any,
    /// Accepts exactly the wrapped character.
    Literal(char),
    /// Accepts characters for which `char::is_numeric` is true.
    Numeric,
    /// Negation of `Numeric`.
    NonNumeric,
    /// Accepts characters for which `char::is_whitespace` is true.
    Whitespace,
    /// Negation of `Whitespace`.
    NonWhitespace,
    /// Accepts characters for which `char::is_alphanumeric` is true.
    Alphanumeric,
    /// Negation of `Alphanumeric`.
    NonAlphanumeric,
}

impl From<char> for MetaChar {
    /// Interprets an unescaped pattern character: `.` becomes the wildcard,
    /// every other character stands for itself.
    fn from(c: char) -> Self {
        if c == '.' {
            MetaChar::Any
        } else {
            MetaChar::Literal(c)
        }
    }
}

/// Operations on a pattern atom beyond the plain `From<char>` conversion.
trait MetaCharExt {
/// Builds the atom for `c` when `c` was preceded by a backslash in the pattern.
fn from_escaped(c: char) -> Self;
/// Returns `true` when the atom accepts the character `c`.
fn contains(&self, c: char) -> bool;
}

impl MetaCharExt for MetaChar {
    /// Maps the character following a backslash to its atom.
    ///
    /// `d`, `s` and `w` select a character class and their uppercase forms
    /// select the negated class; any other character is taken literally.
    fn from_escaped(c: char) -> Self {
        match c {
            'd' => MetaChar::Numeric,
            'D' => MetaChar::NonNumeric,
            's' => MetaChar::Whitespace,
            'S' => MetaChar::NonWhitespace,
            'w' => MetaChar::Alphanumeric,
            'W' => MetaChar::NonAlphanumeric,
            _ => MetaChar::Literal(c),
        }
    }

    /// Tells whether `c` is accepted by this atom.
    fn contains(&self, c: char) -> bool {
        match *self {
            MetaChar::Any => true,
            MetaChar::Literal(expected) => expected == c,
            MetaChar::Numeric => c.is_numeric(),
            MetaChar::NonNumeric => !c.is_numeric(),
            MetaChar::Whitespace => c.is_whitespace(),
            MetaChar::NonWhitespace => !c.is_whitespace(),
            MetaChar::Alphanumeric => c.is_alphanumeric(),
            MetaChar::NonAlphanumeric => !c.is_alphanumeric(),
        }
    }
}

/// A regular expression, stored as its source pattern string.
///
/// The pattern is not pre-compiled: it is interpreted on every search.
#[derive(Debug)]
pub struct Regex(String);

impl Regex {
    /// Creates a regular expression from the pattern `re`.
    pub fn new(re: &str) -> Self {
        debug!("debug: Regex::new({:?})", re);
        Self(String::from(re))
    }

    /// Returns `true` when the pattern matches somewhere in `text`.
    pub fn is_match(&self, text: &str) -> bool {
        self.find(text).is_some()
    }

    /// Searches `text` for the pattern and returns the `(start, end)` pair
    /// reported by the matcher, or `None` when there is no match.
    ///
    /// Both the pattern and the text are processed as sequences of `char`,
    /// so the returned positions count characters, not bytes.
    pub fn find(&self, text: &str) -> Option<(usize, usize)> {
        let pattern: Vec<char> = self.0.chars().collect();
        let haystack: Vec<char> = text.chars().collect();
        let (mut start, mut end) = (0, 0);
        let found = is_match(&pattern, &haystack, &mut start, &mut end);
        found.then(|| (start, end))
    }
}

/// Searches `text` for the first position where `re` matches.
///
/// On success `*start` is the index of the first matched character and
/// `*end` is the index one past the last matched character (both counted in
/// `char`s from the beginning of `text`). On failure the values left in
/// `*start` and `*end` are meaningless.
///
/// A leading `^` anchors the pattern: only a match beginning at index 0 is
/// attempted.
fn is_match(re: &[char], text: &[char], start: &mut usize, end: &mut usize) -> bool {
    debug!("debug: is_match({:?}, {:?})", re, text);
    // Both the empty pattern and an anchored pattern match from index 0.
    *start = 0;
    *end = 0;
    if re.is_empty() {
        return true;
    }
    if re[0] == '^' {
        // `end` must start at 0 here: `is_match_here` adds one per consumed
        // character, so any other base reports an end past the real match.
        return is_match_here(&re[1..], text, end);
    }
    // Try every starting offset, including `text.len()` so that patterns
    // able to match the empty string succeed at the very end of the text.
    for i in 0..=text.len() {
        *start = i;
        *end = i;
        if is_match_here(re, &text[i..], end) {
            return true;
        }
    }
    false
}

/// Tries to match `re` at the very beginning of `text`.
///
/// `*end` is incremented for every character consumed along the way; note
/// that it may already have been advanced when the function returns `false`.
fn is_match_here(re: &[char], text: &[char], end: &mut usize) -> bool {
    debug!("debug: is_match_here({:?}, {:?})", re, text);
    let first = match re.first() {
        // Nothing left in the pattern: everything matched.
        None => return true,
        // `$` succeeds only when the text is exhausted.
        Some(&'$') => return text.is_empty(),
        Some(&c) => c,
    };

    // `atom_last` is the index of the last pattern char belonging to the
    // atom: 1 for an escaped pair such as `\d`, 0 for a single character.
    let (mc, atom_last) = match re.get(1) {
        Some(&escaped) if first == '\\' => (MetaChar::from_escaped(escaped), 1),
        _ => (MetaChar::from(first), 0),
    };

    if let Some(&quantifier) = re.get(atom_last + 1) {
        // A `?` right after the quantifier selects the lazy variant.
        let lazy = re.get(atom_last + 2) == Some(&'?');
        let rest = if lazy {
            &re[atom_last + 3..]
        } else {
            &re[atom_last + 2..]
        };
        match quantifier {
            '*' => return is_match_star(lazy, mc, rest, text, end),
            '+' => return is_match_plus(lazy, mc, rest, text, end),
            '?' => return is_match_ques(lazy, mc, rest, text, end),
            _ => {}
        }
    }

    // Plain atom: consume one character and continue with the remainder.
    match text.split_first() {
        Some((&head, tail)) if mc.contains(head) => {
            *end += 1;
            is_match_here(&re[atom_last + 1..], tail, end)
        }
        _ => false,
    }
}

/// Handles the `*` quantifier: any number of `mc` occurrences, then `re`.
fn is_match_star(lazy: bool, mc: MetaChar, re: &[char], text: &[char], end: &mut usize) -> bool {
    debug!("debug: is_match_star({:?}, {:?}, {:?}", mc, re, text);
    let any_count = ..;
    is_match_char(lazy, mc, re, text, any_count, end)
}

/// Handles the `+` quantifier: at least one `mc` occurrence, then `re`.
fn is_match_plus(lazy: bool, mc: MetaChar, re: &[char], text: &[char], end: &mut usize) -> bool {
    debug!("debug: is_match_plus({:?}, {:?}, {:?}", mc, re, text);
    let at_least_one = 1..;
    is_match_char(lazy, mc, re, text, at_least_one, end)
}

/// Handles the `?` quantifier: zero or one `mc` occurrence, then `re`.
fn is_match_ques(lazy: bool, mc: MetaChar, re: &[char], text: &[char], end: &mut usize) -> bool {
    debug!("debug: is_match_ques({:?}, {:?}, {:?}", mc, re, text);
    let at_most_one = ..2;
    is_match_char(lazy, mc, re, text, at_most_one, end)
}

/// Matches a quantified atom followed by the rest of the pattern.
///
/// * `lazy` - try the smallest repetition count first instead of the largest.
/// * `mc` - the atom being repeated.
/// * `re` - the pattern that must match after the repeated atom.
/// * `text` - the text to match, starting at the first candidate character.
/// * `range` - the repetition counts allowed by the quantifier.
/// * `end` - running end position of the match; on success it is advanced by
///   the number of characters consumed by the atom and by `re`, on failure it
///   is left at the value it had on entry.
fn is_match_char<T: RangeBounds<usize>>(lazy: bool, mc: MetaChar, re: &[char], text: &[char], range: T, end: &mut usize) -> bool {
    debug!("debug: is_match_char({:?}, {:?}, {:?}", mc, re, text);
    let n = text.len();
    // `is_match_here` advances `end` before it knows whether the whole tail
    // matches, so remember the entry value and roll back after each failed
    // attempt. Otherwise partial progress from abandoned attempts would be
    // added to the reported end position.
    let base = *end;

    // Greedy matching starts from the longest run of accepted characters and
    // backtracks; lazy matching starts from zero repetitions and grows.
    let mut i = if lazy {
        0
    } else {
        text.iter().take_while(|&&c| mc.contains(c)).count()
    };

    loop {
        // Only attempt the tail for repetition counts the quantifier allows.
        if range.contains(&i) {
            if is_match_here(re, &text[i..], end) {
                *end += i;
                return true;
            }
            *end = base;
        }
        if lazy {
            if i == n || !mc.contains(text[i]) {
                return false;
            }
            i += 1;
        } else {
            if i == 0 {
                return false;
            }
            i -= 1;
        }
    }
}

// Checks `Regex::is_match` against a table of cases, then checks the
// positions returned by `Regex::find` for greedy and lazy quantifiers.
#[test_case]
fn test_regex() {
// Each entry is (pattern, text, expected result of `is_match`).
let tests = [
// Empty pattern, literal characters and `.`
("", "aaa", true),
("", "", true),
("aaa", "aaa", true),
("aaa", "bbb", false),
("a.a", "aaa", true),
("a.a", "aba", true),
("a.a", "abb", false),

// `*` quantifier
("a*", "aaa", true),
("a*b", "aab", true),
("a*b*", "aabb", true),
("a*b*", "bb", true),
("a.*", "abb", true),
(".*", "aaa", true),
("a.*", "a", true),

// `+` quantifier
("a.+", "ab", true),
("a.+", "abb", true),
("a.+", "a", false),
("a.+b", "ab", false),
("a.+b", "abb", true),
(".+", "abb", true),
(".+", "b", true),

// `?` quantifier
("a?b", "abb", true),
("a?b", "bb", true),
("a?b", "aabb", true),

// `^` and `$` anchors
("^a.*a$", "aaa", true),
("^#.*", "#aaa", true),
("^#.*", "a#aaa", false),
(".*;$", "aaa;", true),
(".*;$", "aaa;a", false),
("^.*$", "aaa", true),

// Escaped `.` and escaped backslashes (each `\\` below is one backslash
// in the actual pattern or text)
("a.b", "abb", true),
("a.b", "a.b", true),
("a\\.b", "abb", false),
("a\\.b", "a.b", true),
("a\\\\.b", "abb", false),
("a\\\\.b", "a.b", false),
("a\\\\.b", "a\\bb", true),
("a\\\\.b", "a\\.b", true),
("a\\\\\\.b", "a\\bb", false),
("a\\\\\\.b", "a\\.b", true),
("a\\\\\\.b", "a\\\\bb", false),
("a\\\\\\.b", "a\\\\.b", false),
("a\\\\\\\\.b", "a\\bb", false),
("a\\\\\\\\.b", "a\\.b", false),
("a\\\\\\\\.b", "a\\\\bb", true),
("a\\\\\\\\.b", "a\\\\.b", true),

// Character classes: `\w`, `\W`, `\d`, `\D`, `\s`, `\S`
("a\\wb", "aéb", true),
("a\\wb", "awb", true),
("a\\wb", "abb", true),
("a\\wb", "a1b", true),
("a\\wb", "a.b", false),
("a\\Wb", "aWb", false),
("a\\Wb", "abb", false),
("a\\Wb", "a1b", false),
("a\\Wb", "a.b", true),
("a\\db", "abb", false),
("a\\db", "a1b", true),
("a\\Db", "abb", true),
("a\\Db", "a1b", false),
("a\\sb", "abb", false),
("a\\sb", "a b", true),
("a\\Sb", "abb", true),
("a\\Sb", "a b", false),

// Quantifier applied to an escaped atom
("a\\.*d", "a..d", true),
("a\\.*d", "a.cd", false),
("a\\w*d", "abcd", true),
];
for (re, text, is_match) in tests {
assert!(Regex::new(re).is_match(text) == is_match, "Regex::new(\"{}\").is_match(\"{}\") == {}", re, text, is_match);
}

// `find` positions: quantifiers take the longest match by default and the
// shortest one when followed by `?`.
assert_eq!(Regex::new(".*").find("abcd"), Some((0, 4)));
assert_eq!(Regex::new("b.*c").find("aaabbbcccddd"), Some((3, 9)));
assert_eq!(Regex::new("b.*?c").find("aaabbbcccddd"), Some((3, 7)));
assert_eq!(Regex::new("a\\w*d").find("abcdabcd"), Some((0, 8)));
assert_eq!(Regex::new("a\\w*?d").find("abcdabcd"), Some((0, 4)));
}
4 changes: 4 additions & 0 deletions src/sys/fs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,10 @@ impl File {
None
}

/// Returns an owned copy of this file's `name` field.
pub fn name(&self) -> String {
self.name.clone()
}

/// Returns this file's `size` field converted to `usize`.
// NOTE(review): presumably a size in bytes — confirm against the `File` definition.
pub fn size(&self) -> usize {
self.size as usize
}
Expand Down
Loading

0 comments on commit de48c87

Please sign in to comment.