From 29b9195d886a5e1d13dc7109a002a7f8f12e5406 Mon Sep 17 00:00:00 2001
From: masklinn <github.com@masklinn.net>
Date: Tue, 9 Jul 2024 18:49:35 +0200
Subject: [PATCH] Center for rewriting regex who can't ASCII good and memory
 issues too
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Per previous commits on the rough comparisons between regex-filtered
and re2, while regex-filtered is very competitive indeed on the CPU
side it suffers from memory usage issues.

This stems from two issues:

character classes
=================

`re2` uses [ASCII-only perl character classes][1], regex uses
[full-unicode Perl character classes][2] defined in terms of [UTS#18
properties][3], this leads to *much* large state graphs for `\d` and
`\w` (`\s` seems to cause much less trouble).

While uap-core doesn't actually specify regex semantics, [Javascript
perl character classes are ASCII-only][4].

As such, a valid mitigation *for ua-parser* is to convert `\d`, `\D`,
`\w`, and `\W` to the corresponding ASCII classes (I used literal
enumerations from MDN but POSIX-style classes [would have worked
too][5]). This was way helped by regex supporting [nesting enumerated
character classes][6] as it means I don't need to special-case
expanding perl-style character classes inside enumerations.

Because capture amplifies the issue, this conversion reduces memory
consumption by between 30% for non-captured digits:

    > echo -n "\d+" | cargo r -qr -- -q
    13496 8826
    > echo -n "[0-9]+" | cargo r -qr -- -q
    10946 1322

and *two orders of magnitude* for captured word characters:

    > echo -n "(\w+)" | cargo r -qr -- -q
    605008 73786
    > echo -n "([a-zA-Z0-9_]+)" | cargo r -qr -- -q
    6968 3332

Bounded repetitions
===================

A large amount of bounded repetitions (`{a,b}`) was added to
regexes.yaml [for catastrophic backtracking migitation][7]. While this
is nice for backracking based engines, it's not relevant to regex
which is based on finite automata, however bounded repetitions *does*
cost significantly more than unbounded repetitions:

    > echo -n ".+" | cargo r -qr -- -q
    7784 4838
    > echo -n ".{0,100}" | cargo r -qr -- -q
    140624 118326

And this also compounds with the previous item when bounded repetition
is used with a perl character class (although that's not very common
in `regexes.yaml`, it's mostly tacked on `.`).

This can be mitigated by converting "large" bounded repetitions
(arbitrarily defined as an upper bound of two digits or more) to
unbounded repetitions.

Results
=======

The end results of that work is a 22% reduction in peak memory
footprint when running ua-parser over the entire sample using core's
`regexes.yaml`... and even a ~4% gain in runtime despite doing more
work up-front and not optimising for that[^1].

before
------

    > /usr/bin/time -l ../target/release/examples/bench -r 10 ~/sources/thirdparty/uap-core/regexes.yaml ~/sources/thirdparty/uap-python/samples/useragents.txt
    Lines: 751580
    Total time: 9.363202625s
    12µs / line
        9.71 real         9.64 user         0.04 sys
           254590976  maximum resident set size
                   0  average shared memory size
                   0  average unshared data size
                   0  average unshared stack size
               15647  page reclaims
                  13  page faults
                   0  swaps
                   0  block input operations
                   0  block output operations
                   0  messages sent
                   0  messages received
                   0  signals received
                   0  voluntary context switches
                  33  involuntary context switches
         84520306010  instructions retired
         31154406450  cycles elapsed
           245909184  peak memory footprint

after
-----

> /usr/bin/time -l ../target/release/examples/bench -r 10 ~/sources/thirdparty/uap-core/regexes.yaml ~/sources/thirdparty/uap-python/samples/useragents.txt
Lines: 751580
Total time: 8.754590666s
11µs / line
        9.37 real         8.95 user         0.03 sys
           196198400  maximum resident set size
                   0  average shared memory size
                   0  average unshared data size
                   0  average unshared stack size
               12083  page reclaims
                  13  page faults
                   0  swaps
                   0  block input operations
                   0  block output operations
                   0  messages sent
                   0  messages received
                   0  signals received
                  11  voluntary context switches
                  40  involuntary context switches
         80119011397  instructions retired
         28903938853  cycles elapsed
           192169408  peak memory footprint

the world that almost was
-------------------------

Sadly as it turns out there are a few large-ish *functional* bounded
repetitions, for instance

    ; {0,2}(moto)(.{0,50})(?: Build|\) AppleWebKit)

mis-captures if it's converted to `.*`. This means my original
threshold of converting any repetition with two digits upper bound was
a bust and I had to move up to 3 (there are no upper bounds above 50
but below 100). Opened ua-parser/uap-core#596 in case this could be
improved with a cleaner project-supported signal.

With the original two-digit versions, we reached *47%* peak memory
footprint reduction and 9% runtime improvement:

    > /usr/bin/time -l ../target/release/examples/bench -r 10 ~/sources/thirdparty/uap-core/regexes.yaml ~/sources/thirdparty/uap-python/samples/useragents.txt
    Lines: 751580
    Total time: 8.541360667s
    11µs / line
        8.75 real         8.70 user         0.02 sys
           135331840  maximum resident set size
                   0  average shared memory size
                   0  average unshared data size
                   0  average unshared stack size
                8367  page reclaims
                  13  page faults
                   0  swaps
                   0  block input operations
                   0  block output operations
                   0  messages sent
                   0  messages received
                   0  signals received
                   0  voluntary context switches
                  25  involuntary context switches
         78422091147  instructions retired
         28079764502  cycles elapsed
           130106688  peak memory footprint

Fixes #2

[^1]: that surprised me but the gains seem consistent from one run to
    the next and we can clearly see a reduction in both cycles elapsed
    and instructions retired so I'll take it ¯\_(ツ)_/¯ IPC even
    increases slightly from 2.7 to 2.8 yipee

[1]: https://github.com/google/re2/wiki/Syntax
[2]: https://docs.rs/regex/latest/regex/#perl-character-classes-unicode-friendly
[3]: https://www.unicode.org/reports/tr18/#Compatibility_Properties
[4]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_expressions/Character_classes
[5]: https://docs.rs/regex/latest/regex/#ascii-character-classes
[6]: https://docs.rs/regex/latest/regex/#character-classes
[7]: https://github.com/ua-parser/uap-core/commit/6e65445448586361f6c5b3389ef619249d56e070
---
 ua-parser/src/lib.rs | 138 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 135 insertions(+), 3 deletions(-)

diff --git a/ua-parser/src/lib.rs b/ua-parser/src/lib.rs
index f63eab6..7b413ad 100644
--- a/ua-parser/src/lib.rs
+++ b/ua-parser/src/lib.rs
@@ -192,7 +192,7 @@ pub mod user_agent {
         /// Pushes a parser into the builder, may fail if the
         /// [`Parser::regex`] is invalid.
         pub fn push(mut self, ua: Parser<'a>) -> Result<Self, super::Error> {
-            self.builder = self.builder.push(&ua.regex)?;
+            self.builder = self.builder.push(&super::rewrite_regex(&ua.regex))?;
             let r = &self.builder.regexes()[self.builder.regexes().len() - 1];
             // number of groups in regex, excluding implicit entire match group
             let groups = r.captures_len() - 1;
@@ -357,7 +357,7 @@ pub mod os {
         /// be parsed, or if [`Parser::os_replacement`] is missing and
         /// the regex has no groups.
         pub fn push(mut self, os: Parser<'a>) -> Result<Self, ParseError> {
-            self.builder = self.builder.push(&os.regex)?;
+            self.builder = self.builder.push(&super::rewrite_regex(&os.regex))?;
             let r = &self.builder.regexes()[self.builder.regexes().len() - 1];
             // number of groups in regex, excluding implicit entire match group
             let groups = r.captures_len() - 1;
@@ -523,7 +523,7 @@ pub mod device {
         /// which [`Parser::regex`] is missing.
         pub fn push(mut self, device: Parser<'a>) -> Result<Self, ParseError> {
             self.builder = self.builder.push_opt(
-                &device.regex,
+                &super::rewrite_regex(&device.regex),
                 regex_filtered::Options::new()
                     .case_insensitive(device.regex_flag == Some(Flag::IgnoreCase)),
             )?;
@@ -607,3 +607,135 @@ pub mod device {
         pub model: Option<String>,
     }
 }
+
+/// Rewrites a regex's character classes to ascii and bounded
+/// repetitions to unbounded, the second to reduce regex memory
+/// requirements, and the first for both that and to better match the
+/// (inferred) semantics intended for ua-parser.
+fn rewrite_regex(re: &str) -> std::borrow::Cow<'_, str> {
+    let mut from = 0;
+    let mut out = String::new();
+
+    let mut it = re.char_indices();
+    let mut escape = false;
+    let mut inclass = 0;
+    'main: while let Some((idx, c)) = it.next() {
+        match c {
+            '\\' if !escape => {
+                escape = true;
+                continue
+            }
+            '{' if !escape && inclass == 0 => {
+                if idx == 0 {
+                    // we're repeating nothing, this regex is broken, bail
+                    return re.into()
+                }
+                // we don't need to loop, we only want to replace {0, ...} and {1, ...}
+                let Some((_, start)) = it.next() else {
+                    continue;
+                };
+                if start != '0' && start != '1' {
+                    continue;
+                }
+
+                if !matches!(it.next(), Some((_, ','))) {
+                    continue;
+                }
+
+                let mut digits = 0;
+                for (ri, rc) in it.by_ref() {
+                    match rc {
+                        '}' if digits > 2 => {
+                            // here idx is the index of the start of
+                            // the range and ri is the end of range
+                            out.push_str(&re[from..idx]);
+                            from = ri+1;
+                            out.push_str(if start == '0' { "*" } else { "+" });
+                            break;
+                        }
+                        c if c.is_ascii_digit() => {
+                            digits += 1;
+                        }
+                        _ => {
+                            continue 'main
+                        }
+                    }
+                }
+            }
+            '[' if !escape => { inclass += 1; }
+            ']' if !escape => { inclass += 1; }
+            // no need for special cases because regex allows nesting
+            // character classes, whereas js or python don't \o/
+            'd' if escape => {
+                // idx is d so idx-1 is \\, and we want to exclude it
+                out.push_str(&re[from..idx-1]);
+                from = idx+1;
+                out.push_str("[0-9]");
+            }
+            'D' if escape => {
+                out.push_str(&re[from..idx-1]);
+                from = idx+1;
+                out.push_str("[^0-9]");
+            }
+            'w' if escape => {
+                out.push_str(&re[from..idx-1]);
+                from = idx+1;
+                out.push_str("[A-Za-z0-9_]");
+            }
+            'W' if escape => {
+                out.push_str(&re[from..idx-1]);
+                from = idx+1;
+                out.push_str("[^A-Za-z0-9_]");
+            }
+            _ => ()
+        }
+        escape = false;
+    }
+
+    if from == 0 {
+        re.into()
+    } else {
+        out.push_str(&re[from..]);
+        out.into()
+    }
+}
+
+#[cfg(test)]
+mod test_rewrite_regex {
+    use super::rewrite_regex as rewrite;
+
+    #[test]
+    fn ignore_small_repetition() {
+        assert_eq!(rewrite(".{0,2}x"), ".{0,2}x");
+        assert_eq!(rewrite(".{0,}"), ".{0,}");
+        assert_eq!(rewrite(".{1,}"), ".{1,}");
+    }
+
+    #[test]
+    fn rewrite_large_repetitions() {
+        assert_eq!(rewrite(".{0,20}x"), ".{0,20}x");
+        assert_eq!(rewrite("(.{0,100})"), "(.*)");
+        assert_eq!(rewrite("(.{1,50})"), "(.{1,50})");
+        assert_eq!(rewrite(".{1,300}x"), ".+x");
+    }
+
+    #[test]
+    fn ignore_non_repetitions() {
+        assert_eq!(
+            rewrite(r"\{1,2}"),
+            r"\{1,2}",
+            "if the opening brace is escaped it's not a repetition");
+        assert_eq!(
+            rewrite("[.{1,100}]"),
+            "[.{1,100}]",
+            "inside a set it's not a repetition"
+        );
+    }
+
+    #[test]
+    fn rewrite_classes() {
+        assert_eq!(rewrite(r"\dx"), "[0-9]x");
+        assert_eq!(rewrite(r"\wx"), "[A-Za-z0-9_]x");
+        assert_eq!(rewrite(r"[\d]x"), r"[[0-9]]x");
+    }
+}