Skip to content

Commit

Permalink
released 3.10.1
Browse files Browse the repository at this point in the history
fix a performance issue with specific regex patterns when used with case-insensitive pattern matching.
  • Loading branch information
genivia-inc committed Mar 17, 2023
1 parent 1c131c3 commit 95f4462
Show file tree
Hide file tree
Showing 6 changed files with 73 additions and 32 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5250,7 +5250,7 @@ in markdown:



ugrep 3.10.0 February 28, 2023 UGREP(1)
ugrep 3.10.1 March 17, 2023 UGREP(1)

🔝 [Back to table of contents](#toc)

Expand Down
Binary file modified bin/win32/ugrep.exe
Binary file not shown.
Binary file modified bin/win64/ugrep.exe
Binary file not shown.
99 changes: 70 additions & 29 deletions lib/pattern.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,25 @@ void Pattern::init(const char *options, const uint8_t *pred)
{
// all patterns are strings, do not construct a DFA with subset construction
start = tfa_.root();
if (opt_.i)
{
// convert edges to case-insensitive by adding upper case transitions for alphas normalized to lower case
timer_type et;
timer_start(et);
for (DFA::State *state = start; state; state = state->next)
{
for (DFA::State::Edges::iterator t = state->edges.begin(); t != state->edges.end(); ++t)
{
Char c = t->first;
if (c >= 'a' && c <= 'z')
{
state->edges[uppercase(c)] = std::pair<Char,DFA::State*>(uppercase(c), t->second.second);
++eno_;
}
}
}
ems_ += timer_elapsed(et);
}
}
else
{
Expand Down Expand Up @@ -462,27 +481,21 @@ void Pattern::parse(
c = lowercase(c);
}
#ifdef WITH_TREE_DFA
DFA::State *target_state;
DFA::State::Edges::iterator i = t->edges.find(c);
if (i == t->edges.end())
{
if (last_state == NULL)
last_state = t; // t points to the tree DFA start state
target_state = last_state = last_state->next = tfa_.state();
DFA::State *target_state = last_state = last_state->next = tfa_.state();
t->edges[c] = std::pair<Char,DFA::State*>(c, target_state);
if (c >= 'a' && c <= 'z' && opt_.i)
{
t->edges[uppercase(c)] = std::pair<Char,DFA::State*>(uppercase(c), target_state);
++eno_;
}
t = target_state;
++eno_;
++vno_;
}
else
{
target_state = i->second.second;
t = i->second.second;
}
t = target_state;
#else
t = tfa_.edge(t, c);
#endif
Expand Down Expand Up @@ -545,11 +558,11 @@ void Pattern::parse(
else if (at(loc) != 0)
error(regex_error::invalid_syntax, loc);
if (opt_.i)
update_modified(ModConst::i, modifiers, 0, len - 1);
update_modified(ModConst::i, modifiers, 0, len);
if (opt_.m)
update_modified(ModConst::m, modifiers, 0, len - 1);
update_modified(ModConst::m, modifiers, 0, len);
if (opt_.s)
update_modified(ModConst::s, modifiers, 0, len - 1);
update_modified(ModConst::s, modifiers, 0, len);
pms_ = timer_elapsed(t);
#ifdef DEBUG
DBGLOGN("startpos = {");
Expand Down Expand Up @@ -1389,15 +1402,28 @@ void Pattern::compile(
if (moves.empty())
{
// no DFA transitions: the final DFA transitions are the tree DFA transitions to target states
for (DFA::State::Edges::iterator t = state->tnode->edges.begin(); t != state->tnode->edges.end(); ++t)
if (opt_.i)
{
Char c = t->first;
DFA::State *target_state = last_state = last_state->next = dfa_.state(t->second.second);
state->edges[c] = std::pair<Char,DFA::State*>(c, target_state);
++eno_;
if (opt_.i && c >= 'a' && c <= 'z')
for (DFA::State::Edges::iterator t = state->tnode->edges.begin(); t != state->tnode->edges.end(); ++t)
{
state->edges[uppercase(c)] = std::pair<Char,DFA::State*>(uppercase(c), target_state);
Char c = t->first;
DFA::State *target_state = last_state = last_state->next = dfa_.state(t->second.second);
state->edges[c] = std::pair<Char,DFA::State*>(c, target_state);
if (c >= 'a' && c <= 'z')
{
state->edges[uppercase(c)] = std::pair<Char,DFA::State*>(uppercase(c), target_state);
++eno_;
}
++eno_;
}
}
else
{
for (DFA::State::Edges::iterator t = state->tnode->edges.begin(); t != state->tnode->edges.end(); ++t)
{
Char c = t->first;
DFA::State *target_state = last_state = last_state->next = dfa_.state(t->second.second);
state->edges[c] = std::pair<Char,DFA::State*>(c, target_state);
++eno_;
}
}
Expand Down Expand Up @@ -1490,19 +1516,34 @@ void Pattern::compile(
{
Char lo = chars.lo();
Char hi = chars.hi();
for (Char c = lo; c <= hi; ++c)
if (opt_.i)
{
if (chars.contains(c))
for (Char c = lo; c <= hi; ++c)
{
DFA::State *target_state = last_state = last_state->next = dfa_.state(state->tnode->edges[c].second);
if (opt_.i && std::isalpha(c))
if (chars.contains(c))
{
state->edges[lowercase(c)] = std::pair<Char,DFA::State*>(lowercase(c), target_state);
state->edges[uppercase(c)] = std::pair<Char,DFA::State*>(uppercase(c), target_state);
eno_ += 2;
DFA::State *target_state = last_state = last_state->next = dfa_.state(state->tnode->edges[c].second);
if (std::isalpha(c))
{
state->edges[lowercase(c)] = std::pair<Char,DFA::State*>(lowercase(c), target_state);
state->edges[uppercase(c)] = std::pair<Char,DFA::State*>(uppercase(c), target_state);
eno_ += 2;
}
else
{
state->edges[c] = std::pair<Char,DFA::State*>(c, target_state);
++eno_;
}
}
else
}
}
else
{
for (Char c = lo; c <= hi; ++c)
{
if (chars.contains(c))
{
DFA::State *target_state = last_state = last_state->next = dfa_.state(state->tnode->edges[c].second);
state->edges[c] = std::pair<Char,DFA::State*>(c, target_state);
++eno_;
}
Expand Down Expand Up @@ -3382,7 +3423,7 @@ void Pattern::export_code() const
{
::fprintf(file, "#ifndef REFLEX_CODE_DECL\n#include <reflex/pattern.h>\n#define REFLEX_CODE_DECL const reflex::Pattern::Opcode\n#endif\n\n");
write_namespace_open(file);
::fprintf(file, "extern REFLEX_CODE_DECL reflex_code_%s[%u] =\n{\n", opt_.n.empty() ? "FSM" : opt_.n.c_str(), nop_);
::fprintf(file, "REFLEX_CODE_DECL reflex_code_%s[%u] =\n{\n", opt_.n.empty() ? "FSM" : opt_.n.c_str(), nop_);
for (Index i = 0; i < nop_; ++i)
{
Opcode opcode = opc_[i];
Expand Down Expand Up @@ -3657,7 +3698,7 @@ void Pattern::gen_predict_match_transitions(size_t level, DFA::State *state, ORa

void Pattern::write_predictor(FILE *file) const
{
::fprintf(file, "extern const reflex::Pattern::Pred reflex_pred_%s[%zu] = {", opt_.n.empty() ? "FSM" : opt_.n.c_str(), 2 + len_ + (min_ > 1 && len_ == 0) * 256 + (min_ > 0) * Const::HASH);
::fprintf(file, "const reflex::Pattern::Pred reflex_pred_%s[%zu] = {", opt_.n.empty() ? "FSM" : opt_.n.c_str(), 2 + len_ + (min_ > 1 && len_ == 0) * 256 + (min_ > 0) * Const::HASH);
::fprintf(file, "\n %3hhu,%3hhu,", static_cast<uint8_t>(len_), (static_cast<uint8_t>(min_ | (one_ << 4))));
for (size_t i = 0; i < len_; ++i)
::fprintf(file, "%s%3hhu,", ((i + 2) & 0xF) ? "" : "\n ", static_cast<uint8_t>(pre_[i]));
Expand Down
2 changes: 1 addition & 1 deletion man/ugrep.1
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
.TH UGREP "1" "February 28, 2023" "ugrep 3.10.0" "User Commands"
.TH UGREP "1" "March 17, 2023" "ugrep 3.10.1" "User Commands"
.SH NAME
\fBugrep\fR, \fBug\fR -- file pattern searcher
.SH SYNOPSIS
Expand Down
2 changes: 1 addition & 1 deletion src/ugrep.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
#define UGREP_HPP

// ugrep version
#define UGREP_VERSION "3.10.0"
#define UGREP_VERSION "3.10.1"

// disable mmap because mmap is almost always slower than regular file reading, given the file reading speed improvements since 3.0.0
#define WITH_NO_MMAP
Expand Down

0 comments on commit 95f4462

Please sign in to comment.