Skip to content

Commit

Permalink
api: introduce new regex-lite crate
Browse files Browse the repository at this point in the history
Closes #961
  • Loading branch information
BurntSushi committed May 22, 2023
1 parent a60aeb6 commit 31d5df0
Show file tree
Hide file tree
Showing 29 changed files with 7,381 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,8 @@ jobs:
- name: Run subset of regex-automata tests
if: matrix.build != 'win-gnu' # Just horrifically slow.
run: ${{ env.CARGO }} test --verbose --manifest-path regex-automata/Cargo.toml $TARGET
- name: Run regex-lite tests
run: ${{ env.CARGO }} test --verbose --manifest-path regex-lite/Cargo.toml $TARGET

# This job runs a stripped down version of CI to test the MSRV. The specific
# reason for doing this is that the regex crate's dev-dependencies tend to
Expand Down
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ members = [
"regex-automata",
"regex-capi",
"regex-cli",
"regex-lite",
"regex-syntax",
"regex-test",
]
Expand Down
1 change: 1 addition & 0 deletions regex-cli/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ log = { version = "0.4.17", features = ["std"] }
memmap2 = "0.5.10"
regex = { path = ".." }
regex-automata = { path = "../regex-automata", features = ["logging"] }
regex-lite = { path = "../regex-lite" }
regex-syntax = { path = "../regex-syntax" }
tabwriter = { version = "1.2.1", features = ["ansi_formatting"] }
textwrap = { version = "0.16.0", default-features = false }
84 changes: 84 additions & 0 deletions regex-cli/args/lite.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
use {
lexopt::{Arg, Parser},
regex_automata::util::syntax,
regex_lite::Regex,
};

use crate::args::{self, Configurable, Usage};

/// Exposes the configuration for the top-level `Regex` API.
///
/// The only setting stored here is the optional size limit. Syntax-level
/// options are not stored; they are passed to `from_patterns` separately.
#[derive(Debug, Default)]
pub struct Config {
// Optional limit forwarded to `RegexBuilder::size_limit`. When this is
// `None`, `from_patterns` substitutes `usize::MAX`.
size_limit: Option<usize>,
}

impl Config {
/// Builds a `Regex` from the given syntax configuration and sequence of
/// patterns. This returns an error is `patterns.len() != 1`.
///
/// Note that this also returns an error if any syntax options are set
/// that aren't supported by `regex-lite`.
pub fn from_patterns(
&self,
syntax: &syntax::Config,
patterns: &[String],
) -> anyhow::Result<Regex> {
anyhow::ensure!(
patterns.len() == 1,
"API-level regex requires exactly one pattern, \
but {} were given",
patterns.len(),
);
anyhow::ensure!(
!syntax.get_octal(),
"regex-lite does not support octal mode",
);
anyhow::ensure!(
syntax.get_utf8(),
"regex-lite does not support disabling UTF-8 mode",
);
anyhow::ensure!(
syntax.get_unicode(),
"regex-lite does not support disabling Unicode mode",
);
let mut b = regex_lite::RegexBuilder::new(&patterns[0]);
b.case_insensitive(syntax.get_case_insensitive());
b.multi_line(syntax.get_multi_line());
b.crlf(syntax.get_crlf());
b.dot_matches_new_line(syntax.get_dot_matches_new_line());
b.swap_greed(syntax.get_swap_greed());
b.ignore_whitespace(syntax.get_ignore_whitespace());
b.nest_limit(syntax.get_nest_limit());
b.size_limit(self.size_limit.unwrap_or(usize::MAX));
b.build().map_err(anyhow::Error::from)
}
}

impl Configurable for Config {
    /// Handles the `--size-limit` flag.
    ///
    /// Returns `Ok(true)` when `arg` is `--size-limit` and its value was
    /// parsed, and `Ok(false)` for every other argument.
    fn configure(
        &mut self,
        p: &mut Parser,
        arg: &mut Arg,
    ) -> anyhow::Result<bool> {
        if !matches!(*arg, Arg::Long("size-limit")) {
            return Ok(false);
        }
        self.size_limit = args::parse_maybe(p, "--size-limit")?;
        Ok(true)
    }

    /// Returns the usage entries for the flags handled by `configure`.
    fn usage(&self) -> &[Usage] {
        const USAGES: &[Usage] = &[Usage::new(
            "--size-limit",
            "Set a limit on heap used by a regex.",
            r#"
This sets a limit, in bytes, on the heap memory used by a regex.
The special value 'none' indicates that no size limit should be imposed.
"#,
        )];
        USAGES
    }
}
1 change: 1 addition & 0 deletions regex-cli/args/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ pub mod flags;
pub mod haystack;
pub mod hybrid;
pub mod input;
pub mod lite;
pub mod meta;
pub mod onepass;
pub mod overlapping;
Expand Down
103 changes: 103 additions & 0 deletions regex-cli/cmd/find/capture/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ USAGE:
ENGINES:
backtrack Search with the bounded backtracker regex engine.
lite Search with the regex-lite engine.
meta Search with the meta regex engine.
onepass Search with the one-pass DFA regex engine.
pikevm Search with the PikeVM regex engine.
Expand All @@ -40,6 +41,7 @@ ENGINES:
let cmd = args::next_as_command(USAGE, p)?;
match &*cmd {
"backtrack" => nfa::run_backtrack(p),
"lite" => run_lite(p),
"meta" => run_meta(p),
"onepass" => dfa::run_onepass(p),
"pikevm" => nfa::run_pikevm(p),
Expand Down Expand Up @@ -219,6 +221,107 @@ OPTIONS:
Ok(())
}

/// Implements `regex-cli find capture lite`.
///
/// Parses the command line, builds a `regex_lite::Regex` from the single
/// given pattern, checks that the haystack is valid UTF-8 and then runs either
/// the counting or the printing search routine, depending on `find.count`.
///
/// # Errors
///
/// Returns an error if argument parsing fails, if the regex cannot be built,
/// if the haystack is not valid UTF-8, if the capture group info cannot be
/// built, or if the search routine itself reports an error.
fn run_lite(p: &mut lexopt::Parser) -> anyhow::Result<()> {
    const USAGE: &str = "\
Executes a search for full matches using the top-level regex-lite engine.
USAGE:
regex-cli find capture lite [-p <pattern> ...] <haystack-path>
regex-cli find capture lite [-p <pattern> ...] -y <haystack>
TIP:
use -h for short docs and --help for long docs
OPTIONS:
%options%
";

    let mut common = args::common::Config::default();
    let mut patterns = args::patterns::Config::only_flags();
    let mut haystack = args::haystack::Config::default();
    let mut syntax = args::syntax::Config::default();
    let mut lite = args::lite::Config::default();
    let mut find = super::Config::default();
    args::configure(
        p,
        USAGE,
        &mut [
            &mut common,
            &mut patterns,
            &mut haystack,
            &mut syntax,
            &mut lite,
            &mut find,
        ],
    )?;

    let pats = patterns.get()?;
    let syn = syntax.syntax()?;
    let mut table = Table::empty();
    let (re, time) = util::timeitr(|| lite.from_patterns(&syn, &pats))?;
    table.add("build regex time", time);

    // Check that the haystack is valid UTF-8 since regex-lite doesn't support
    // searching arbitrary byte sequences. (At time of writing.)
    haystack.get()?.to_str()?;

    // The top-level API doesn't support regex-automata's more granular Input
    // abstraction.
    let input = args::input::Config::default();
    // The top-level API also doesn't use 'Captures' from regex-automata
    // directly, but we can map between them with some annoyance.
    let group_info = GroupInfo::new([re.capture_names()])
        .context("could not build capture group info")?;
    let mut locs = re.capture_locations();
    let search = |input: &Input<'_>, caps: &mut Captures| {
        use regex_automata::util::primitives::NonMaxUsize;

        // NOTE(review): this relies on the search routines handing back the
        // same haystack that was validated as UTF-8 above -- confirm.
        let haystack = input
            .haystack()
            .to_str()
            .expect("haystack should have been validated as UTF-8");
        // Start from "no match" so a failed search leaves `caps` unmatched.
        caps.set_pattern(None);
        if re
            .captures_read_at(&mut locs, haystack, input.start())
            .is_none()
        {
            return Ok(());
        }
        caps.set_pattern(Some(PatternID::ZERO));
        // Copy every group into its pair of slots: group `i` occupies slot
        // `2 * i` (start) and slot `2 * i + 1` (end).
        for i in 0..locs.len() {
            let (start, end) = match locs.get(i) {
                Some((start, end)) => {
                    (NonMaxUsize::new(start), NonMaxUsize::new(end))
                }
                None => (None, None),
            };
            let slots = caps.slots_mut();
            slots[i * 2] = start;
            slots[i * 2 + 1] = end;
        }
        Ok(())
    };
    if find.count {
        run_counts(
            &mut table,
            &common,
            &find,
            &input,
            &haystack,
            &group_info,
            search,
        )?;
    } else {
        run_search(
            &mut table,
            &common,
            &find,
            &input,
            &haystack,
            &group_info,
            search,
        )?;
    }
    Ok(())
}

/// A function that takes in a bunch of configuration, runs the given search
/// routine, and prints out a table of counts.
fn run_counts(
Expand Down
67 changes: 67 additions & 0 deletions regex-cli/cmd/find/match/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ ENGINES:
backtrack Search with the bounded backtracker regex engine.
dense Search with the dense DFA regex engine.
hybrid Search with the lazy DFA regex engine.
lite Search with the regex-lite engine.
meta Search with the meta regex engine.
onepass Search with the one-pass DFA regex engine.
pikevm Search with the PikeVM regex engine.
Expand All @@ -37,6 +38,7 @@ ENGINES:
"backtrack" => nfa::run_backtrack(p),
"dense" => dfa::run_dense(p),
"hybrid" => dfa::run_hybrid(p),
"lite" => run_lite(p),
"meta" => run_meta(p),
"onepass" => dfa::run_onepass(p),
"pikevm" => nfa::run_pikevm(p),
Expand Down Expand Up @@ -164,6 +166,71 @@ OPTIONS:
Ok(())
}

/// Implements `regex-cli find match lite`.
///
/// Parses the command line, builds a `regex_lite::Regex` from the single
/// given pattern, checks that the haystack is valid UTF-8 and then runs either
/// the counting or the printing search routine, depending on `find.count`.
///
/// # Errors
///
/// Returns an error if argument parsing fails, if the regex cannot be built,
/// if the haystack is not valid UTF-8, or if the search routine itself
/// reports an error.
fn run_lite(p: &mut lexopt::Parser) -> anyhow::Result<()> {
    const USAGE: &str = "\
Executes a search for full matches using the top-level regex-lite engine.
Note that since the regex-lite crate doesn't have an API for searching arbitrary
byte slices, the haystack must be valid UTF-8. If it isn't, this command will
report an error.
USAGE:
regex-cli find match lite [-p <pattern> ...] <haystack-path>
regex-cli find match lite [-p <pattern> ...] -y <haystack>
TIP:
use -h for short docs and --help for long docs
OPTIONS:
%options%
";

    let mut common = args::common::Config::default();
    let mut patterns = args::patterns::Config::only_flags();
    let mut haystack = args::haystack::Config::default();
    let mut syntax = args::syntax::Config::default();
    let mut lite = args::lite::Config::default();
    let mut find = super::Config::default();
    args::configure(
        p,
        USAGE,
        &mut [
            &mut common,
            &mut patterns,
            &mut haystack,
            &mut syntax,
            &mut lite,
            &mut find,
        ],
    )?;

    let pats = patterns.get()?;
    let syn = syntax.syntax()?;
    let mut table = Table::empty();
    let (re, time) = util::timeitr(|| lite.from_patterns(&syn, &pats))?;
    table.add("build regex time", time);

    // Check that the haystack is valid UTF-8 since regex-lite doesn't support
    // searching arbitrary byte sequences. (At time of writing.)
    haystack.get()?.to_str()?;

    // The top-level regex-lite API doesn't support regex-automata's more
    // granular Input abstraction.
    let input = args::input::Config::default();
    let search = |input: &Input<'_>| {
        // NOTE(review): this relies on the search routines handing back the
        // same haystack that was validated as UTF-8 above -- confirm.
        let haystack = input
            .haystack()
            .to_str()
            .expect("haystack should have been validated as UTF-8");
        let found = re.find_at(haystack, input.start());
        // regex-lite only ever has one pattern, so every match is reported
        // under pattern ID zero.
        Ok(found.map(|m| Match::new(PatternID::ZERO, m.start()..m.end())))
    };
    if find.count {
        run_counts(&mut table, &common, &find, &input, &haystack, 1, search)?;
    } else {
        run_search(&mut table, &common, &find, &input, &haystack, search)?;
    }
    Ok(())
}

/// A function that takes in a bunch of configuration, runs the given search
/// routine, and prints out a table of counts.
fn run_counts(
Expand Down
36 changes: 36 additions & 0 deletions regex-lite/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
[package]
name = "regex-lite"
version = "0.1.0" #:version
authors = ["The Rust Project Developers", "Andrew Gallant <[email protected]>"]
license = "MIT OR Apache-2.0"
repository = "https://github.com/rust-lang/regex/tree/master/regex-lite"
documentation = "https://docs.rs/regex-lite"
description = """
A lightweight regex engine that optimizes for binary size and compilation time.
"""
workspace = ".."
edition = "2021"
rust-version = "1.60.0"
autotests = false

# Features are documented in the "Crate features" section of the crate docs:
# https://docs.rs/regex-lite/*/#crate-features
[features]
default = ["std"]
std = []

[dev-dependencies]
anyhow = "1.0.69"
regex-test = { path = "../regex-test", version = "0.1.0" }

[[test]]
path = "tests/lib.rs"
name = "integration"

[package.metadata.docs.rs]
# We want to document all features.
all-features = true
# To test this locally, run:
#
# RUSTDOCFLAGS="--cfg docsrs" cargo +nightly doc --all-features
rustdoc-args = ["--cfg", "docsrs"]
Loading

0 comments on commit 31d5df0

Please sign in to comment.