Skip to content

Commit

Permalink
(|a)* shouldn't match more text than (|a)+ does!
Browse files Browse the repository at this point in the history
Fixes #310.

Change-Id: I16c500e4b4c67a9f01dc9403773f3a40356ecc4d
Reviewed-on: https://code-review.googlesource.com/c/re2/+/58590
Reviewed-by: Paul Wankadia <[email protected]>
  • Loading branch information
junyer committed May 12, 2021
1 parent aa45633 commit bc42365
Show file tree
Hide file tree
Showing 4 changed files with 126 additions and 65 deletions.
76 changes: 48 additions & 28 deletions re2/compile.cc
Original file line number Diff line number Diff line change
Expand Up @@ -79,9 +79,11 @@ static const PatchList kNullPatchList = {0, 0};
struct Frag {
uint32_t begin;
PatchList end;
bool nullable;

Frag() : begin(0) { end.head = 0; } // needed so Frag can go in vector
Frag(uint32_t begin, PatchList end) : begin(begin), end(end) {}
Frag() : begin(0), end(kNullPatchList), nullable(false) {}
Frag(uint32_t begin, PatchList end, bool nullable)
: begin(begin), end(end), nullable(nullable) {}
};

// Input encodings.
Expand Down Expand Up @@ -264,7 +266,7 @@ int Compiler::AllocInst(int n) {

// Returns an unmatchable fragment.
Frag Compiler::NoMatch() {
return Frag(0, kNullPatchList);
return Frag();
}

// Is a an unmatchable fragment?
Expand All @@ -290,11 +292,11 @@ Frag Compiler::Cat(Frag a, Frag b) {
// To run backward over string, reverse all concatenations.
if (reversed_) {
PatchList::Patch(inst_.data(), b.end, a.begin);
return Frag(b.begin, a.end);
return Frag(b.begin, a.end, b.nullable && a.nullable);
}

PatchList::Patch(inst_.data(), a.end, b.begin);
return Frag(a.begin, b.end);
return Frag(a.begin, b.end, a.nullable && b.nullable);
}

// Given fragments for a and b, returns fragment for a|b.
Expand All @@ -310,7 +312,8 @@ Frag Compiler::Alt(Frag a, Frag b) {
return NoMatch();

inst_[id].InitAlt(a.begin, b.begin);
return Frag(id, PatchList::Append(inst_.data(), a.end, b.end));
return Frag(id, PatchList::Append(inst_.data(), a.end, b.end),
a.nullable || b.nullable);
}

// When capturing submatches in like-Perl mode, a kOpAlt Inst
Expand All @@ -320,27 +323,44 @@ Frag Compiler::Alt(Frag a, Frag b) {
// then the operator is greedy. If out1_ is the repetition
// (and out_ moves forward), then the operator is non-greedy.

// Given a fragment a, returns a fragment for a* or a*? (if nongreedy)
Frag Compiler::Star(Frag a, bool nongreedy) {
// Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy)
Frag Compiler::Plus(Frag a, bool nongreedy) {
int id = AllocInst(1);
if (id < 0)
return NoMatch();
inst_[id].InitAlt(0, 0);
PatchList::Patch(inst_.data(), a.end, id);
PatchList pl;
if (nongreedy) {
inst_[id].out1_ = a.begin;
return Frag(id, PatchList::Mk(id << 1));
inst_[id].InitAlt(0, a.begin);
pl = PatchList::Mk(id << 1);
} else {
inst_[id].set_out(a.begin);
return Frag(id, PatchList::Mk((id << 1) | 1));
inst_[id].InitAlt(a.begin, 0);
pl = PatchList::Mk((id << 1) | 1);
}
PatchList::Patch(inst_.data(), a.end, id);
return Frag(a.begin, pl, a.nullable);
}

// Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy)
Frag Compiler::Plus(Frag a, bool nongreedy) {
// a+ is just a* with a different entry point.
Frag f = Star(a, nongreedy);
return Frag(a.begin, f.end);
// Given a fragment for a, returns a fragment for a* or a*? (if nongreedy)
Frag Compiler::Star(Frag a, bool nongreedy) {
// When the subexpression is nullable, one Alt isn't enough to guarantee
// correct priority ordering within the transitive closure. The simplest
// solution is to handle it as (a+)? instead, which adds the second Alt.
if (a.nullable)
return Quest(Plus(a, nongreedy), nongreedy);

int id = AllocInst(1);
if (id < 0)
return NoMatch();
PatchList pl;
if (nongreedy) {
inst_[id].InitAlt(0, a.begin);
pl = PatchList::Mk(id << 1);
} else {
inst_[id].InitAlt(a.begin, 0);
pl = PatchList::Mk((id << 1) | 1);
}
PatchList::Patch(inst_.data(), a.end, id);
return Frag(id, pl, true);
}

// Given a fragment for a, returns a fragment for a? or a?? (if nongreedy)
Expand All @@ -358,7 +378,7 @@ Frag Compiler::Quest(Frag a, bool nongreedy) {
inst_[id].InitAlt(a.begin, 0);
pl = PatchList::Mk((id << 1) | 1);
}
return Frag(id, PatchList::Append(inst_.data(), pl, a.end));
return Frag(id, PatchList::Append(inst_.data(), pl, a.end), true);
}

// Returns a fragment for the byte range lo-hi.
Expand All @@ -367,7 +387,7 @@ Frag Compiler::ByteRange(int lo, int hi, bool foldcase) {
if (id < 0)
return NoMatch();
inst_[id].InitByteRange(lo, hi, foldcase, 0);
return Frag(id, PatchList::Mk(id << 1));
return Frag(id, PatchList::Mk(id << 1), false);
}

// Returns a no-op fragment. Sometimes unavoidable.
Expand All @@ -376,7 +396,7 @@ Frag Compiler::Nop() {
if (id < 0)
return NoMatch();
inst_[id].InitNop(0);
return Frag(id, PatchList::Mk(id << 1));
return Frag(id, PatchList::Mk(id << 1), true);
}

// Returns a fragment that signals a match.
Expand All @@ -385,7 +405,7 @@ Frag Compiler::Match(int32_t match_id) {
if (id < 0)
return NoMatch();
inst_[id].InitMatch(match_id);
return Frag(id, kNullPatchList);
return Frag(id, kNullPatchList, false);
}

// Returns a fragment matching a particular empty-width op (like ^ or $)
Expand All @@ -394,7 +414,7 @@ Frag Compiler::EmptyWidth(EmptyOp empty) {
if (id < 0)
return NoMatch();
inst_[id].InitEmptyWidth(empty, 0);
return Frag(id, PatchList::Mk(id << 1));
return Frag(id, PatchList::Mk(id << 1), true);
}

// Given a fragment a, returns a fragment with capturing parens around a.
Expand All @@ -408,7 +428,7 @@ Frag Compiler::Capture(Frag a, int n) {
inst_[id+1].InitCapture(2*n+1, 0);
PatchList::Patch(inst_.data(), a.end, id+1);

return Frag(id, PatchList::Mk((id+1) << 1));
return Frag(id, PatchList::Mk((id+1) << 1), a.nullable);
}

// A Rune is a name for a Unicode code point.
Expand Down Expand Up @@ -567,15 +587,15 @@ bool Compiler::ByteRangeEqual(int id1, int id2) {
Frag Compiler::FindByteRange(int root, int id) {
if (inst_[root].opcode() == kInstByteRange) {
if (ByteRangeEqual(root, id))
return Frag(root, kNullPatchList);
return Frag(root, kNullPatchList, false);
else
return NoMatch();
}

while (inst_[root].opcode() == kInstAlt) {
int out1 = inst_[root].out1();
if (ByteRangeEqual(out1, id))
return Frag(root, PatchList::Mk((root << 1) | 1));
return Frag(root, PatchList::Mk((root << 1) | 1), false);

// CharClass is a sorted list of ranges, so if out1 of the root Alt wasn't
// what we're looking for, then we can stop immediately. Unfortunately, we
Expand All @@ -587,7 +607,7 @@ Frag Compiler::FindByteRange(int root, int id) {
if (inst_[out].opcode() == kInstAlt)
root = out;
else if (ByteRangeEqual(out, id))
return Frag(root, PatchList::Mk(root << 1));
return Frag(root, PatchList::Mk(root << 1), false);
else
return NoMatch();
}
Expand Down
66 changes: 45 additions & 21 deletions re2/testing/compile_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,20 @@ static Test tests[] = {
{ "[[-`]",
"3. byte [5b-60] 0 -> 4\n"
"4. match! 0\n" },
// Issue 310
{ "(?:|a)*",
"3+ nop -> 7\n"
"4. nop -> 9\n"
"5+ nop -> 7\n"
"6. nop -> 9\n"
"7+ nop -> 5\n"
"8. byte [61-61] 0 -> 5\n"
"9. match! 0\n" },
{ "(?:|a)+",
"3+ nop -> 5\n"
"4. byte [61-61] 0 -> 5\n"
"5+ nop -> 3\n"
"6. match! 0\n" },
};

TEST(TestRegexpCompileToProg, Simple) {
Expand Down Expand Up @@ -338,27 +352,37 @@ TEST(TestCompile, Bug35237384) {
forward);

Dump("(a*|b*)*{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
EXPECT_EQ("3+ nop -> 6\n"
"4+ nop -> 8\n"
"5. nop -> 21\n"
"6+ byte [61-61] 1 -> 6\n"
"7. nop -> 3\n"
"8+ byte [62-62] 1 -> 8\n"
"9. nop -> 3\n"
"10+ byte [61-61] 1 -> 10\n"
"11. nop -> 21\n"
"12+ byte [62-62] 1 -> 12\n"
"13. nop -> 21\n"
"14+ byte [61-61] 1 -> 14\n"
"15. nop -> 18\n"
"16+ byte [62-62] 1 -> 16\n"
"17. nop -> 18\n"
"18+ nop -> 14\n"
"19+ nop -> 16\n"
"20. match! 0\n"
"21+ nop -> 10\n"
"22+ nop -> 12\n"
"23. nop -> 18\n",
EXPECT_EQ("3+ nop -> 28\n"
"4. nop -> 30\n"
"5+ byte [61-61] 1 -> 5\n"
"6. nop -> 32\n"
"7+ byte [61-61] 1 -> 7\n"
"8. nop -> 26\n"
"9+ byte [61-61] 1 -> 9\n"
"10. nop -> 20\n"
"11+ byte [62-62] 1 -> 11\n"
"12. nop -> 20\n"
"13+ byte [62-62] 1 -> 13\n"
"14. nop -> 26\n"
"15+ byte [62-62] 1 -> 15\n"
"16. nop -> 32\n"
"17+ nop -> 9\n"
"18. nop -> 11\n"
"19. match! 0\n"
"20+ nop -> 17\n"
"21. nop -> 19\n"
"22+ nop -> 7\n"
"23. nop -> 13\n"
"24+ nop -> 17\n"
"25. nop -> 19\n"
"26+ nop -> 22\n"
"27. nop -> 24\n"
"28+ nop -> 5\n"
"29. nop -> 15\n"
"30+ nop -> 22\n"
"31. nop -> 24\n"
"32+ nop -> 28\n"
"33. nop -> 30\n",
forward);

Dump("((|S.+)+|(|S.+)+|){2}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
Expand Down
47 changes: 31 additions & 16 deletions re2/testing/re2_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -485,37 +485,37 @@ TEST(ProgramFanout, BigProgram) {

std::vector<int> histogram;

// 3 is the largest non-empty bucket and has 1 element.
// 3 is the largest non-empty bucket and has 2 element.
ASSERT_EQ(3, re1.ProgramFanout(&histogram));
ASSERT_EQ(1, histogram[3]);
ASSERT_EQ(2, histogram[3]);

// 6 is the largest non-empty bucket and has 10 elements.
// 6 is the largest non-empty bucket and has 11 elements.
ASSERT_EQ(6, re10.ProgramFanout(&histogram));
ASSERT_EQ(10, histogram[6]);
ASSERT_EQ(11, histogram[6]);

// 9 is the largest non-empty bucket and has 100 elements.
// 9 is the largest non-empty bucket and has 101 elements.
ASSERT_EQ(9, re100.ProgramFanout(&histogram));
ASSERT_EQ(100, histogram[9]);
ASSERT_EQ(101, histogram[9]);

// 13 is the largest non-empty bucket and has 1000 elements.
// 13 is the largest non-empty bucket and has 1001 elements.
ASSERT_EQ(13, re1000.ProgramFanout(&histogram));
ASSERT_EQ(1000, histogram[13]);
ASSERT_EQ(1001, histogram[13]);

// 2 is the largest non-empty bucket and has 1 element.
// 2 is the largest non-empty bucket and has 2 element.
ASSERT_EQ(2, re1.ReverseProgramFanout(&histogram));
ASSERT_EQ(1, histogram[2]);
ASSERT_EQ(2, histogram[2]);

// 5 is the largest non-empty bucket and has 10 elements.
// 5 is the largest non-empty bucket and has 11 elements.
ASSERT_EQ(5, re10.ReverseProgramFanout(&histogram));
ASSERT_EQ(10, histogram[5]);
ASSERT_EQ(11, histogram[5]);

// 9 is the largest non-empty bucket and has 100 elements.
// 9 is the largest non-empty bucket and has 101 elements.
ASSERT_EQ(9, re100.ReverseProgramFanout(&histogram));
ASSERT_EQ(100, histogram[9]);
ASSERT_EQ(101, histogram[9]);

// 12 is the largest non-empty bucket and has 1000 elements.
// 12 is the largest non-empty bucket and has 1001 elements.
ASSERT_EQ(12, re1000.ReverseProgramFanout(&histogram));
ASSERT_EQ(1000, histogram[12]);
ASSERT_EQ(1001, histogram[12]);
}

// Issue 956519: handling empty character sets was
Expand Down Expand Up @@ -1641,4 +1641,19 @@ TEST(RE2, Issue104) {
ASSERT_EQ("小人小类小", s);
}

TEST(RE2, Issue310) {
// (?:|a)* matched more text than (?:|a)+ did.

std::string s = "aaa";
StringPiece m;

RE2 star("(?:|a)*");
ASSERT_TRUE(star.Match(s, 0, s.size(), RE2::UNANCHORED, &m, 1));
ASSERT_EQ(m, "") << " got m='" << m << "', want ''";

RE2 plus("(?:|a)+");
ASSERT_TRUE(plus.Match(s, 0, s.size(), RE2::UNANCHORED, &m, 1));
ASSERT_EQ(m, "") << " got m='" << m << "', want ''";
}

} // namespace re2
2 changes: 2 additions & 0 deletions re2/testing/search_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,8 @@ RegexpTest simple_tests[] = {
// Former bugs.
{ "a\\C*|ba\\C", "baba" },
{ "\\w*I\\w*", "Inc." },
{ "(?:|a)*", "aaa" },
{ "(?:|a)+", "aaa" },
};

TEST(Regexp, SearchTests) {
Expand Down

0 comments on commit bc42365

Please sign in to comment.