From bc423653fdf28618554da96e1532662d1e33eaca Mon Sep 17 00:00:00 2001 From: Paul Wankadia Date: Wed, 12 May 2021 08:59:23 -0700 Subject: [PATCH] (|a)* shouldn't match more text than (|a)+ does! Fixes #310. Change-Id: I16c500e4b4c67a9f01dc9403773f3a40356ecc4d Reviewed-on: https://code-review.googlesource.com/c/re2/+/58590 Reviewed-by: Paul Wankadia --- re2/compile.cc | 76 +++++++++++++++++++++++-------------- re2/testing/compile_test.cc | 66 ++++++++++++++++++++++---------- re2/testing/re2_test.cc | 47 +++++++++++++++-------- re2/testing/search_test.cc | 2 + 4 files changed, 126 insertions(+), 65 deletions(-) diff --git a/re2/compile.cc b/re2/compile.cc index 7a9de0728..f3624b9a7 100644 --- a/re2/compile.cc +++ b/re2/compile.cc @@ -79,9 +79,11 @@ static const PatchList kNullPatchList = {0, 0}; struct Frag { uint32_t begin; PatchList end; + bool nullable; - Frag() : begin(0) { end.head = 0; } // needed so Frag can go in vector - Frag(uint32_t begin, PatchList end) : begin(begin), end(end) {} + Frag() : begin(0), end(kNullPatchList), nullable(false) {} + Frag(uint32_t begin, PatchList end, bool nullable) + : begin(begin), end(end), nullable(nullable) {} }; // Input encodings. @@ -264,7 +266,7 @@ int Compiler::AllocInst(int n) { // Returns an unmatchable fragment. Frag Compiler::NoMatch() { - return Frag(0, kNullPatchList); + return Frag(); } // Is a an unmatchable fragment? @@ -290,11 +292,11 @@ Frag Compiler::Cat(Frag a, Frag b) { // To run backward over string, reverse all concatenations. if (reversed_) { PatchList::Patch(inst_.data(), b.end, a.begin); - return Frag(b.begin, a.end); + return Frag(b.begin, a.end, b.nullable && a.nullable); } PatchList::Patch(inst_.data(), a.end, b.begin); - return Frag(a.begin, b.end); + return Frag(a.begin, b.end, a.nullable && b.nullable); } // Given fragments for a and b, returns fragment for a|b. @@ -310,7 +312,8 @@ Frag Compiler::Alt(Frag a, Frag b) { return NoMatch(); inst_[id].InitAlt(a.begin, b.begin); - return Frag(id, PatchList::Append(inst_.data(), a.end, b.end)); + return Frag(id, PatchList::Append(inst_.data(), a.end, b.end), + a.nullable || b.nullable); } // When capturing submatches in like-Perl mode, a kOpAlt Inst @@ -320,27 +323,44 @@ Frag Compiler::Alt(Frag a, Frag b) { // then the operator is greedy. If out1_ is the repetition // (and out_ moves forward), then the operator is non-greedy. -// Given a fragment a, returns a fragment for a* or a*? (if nongreedy) -Frag Compiler::Star(Frag a, bool nongreedy) { +// Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy) +Frag Compiler::Plus(Frag a, bool nongreedy) { int id = AllocInst(1); if (id < 0) return NoMatch(); - inst_[id].InitAlt(0, 0); - PatchList::Patch(inst_.data(), a.end, id); + PatchList pl; if (nongreedy) { - inst_[id].out1_ = a.begin; - return Frag(id, PatchList::Mk(id << 1)); + inst_[id].InitAlt(0, a.begin); + pl = PatchList::Mk(id << 1); } else { - inst_[id].set_out(a.begin); - return Frag(id, PatchList::Mk((id << 1) | 1)); + inst_[id].InitAlt(a.begin, 0); + pl = PatchList::Mk((id << 1) | 1); } + PatchList::Patch(inst_.data(), a.end, id); + return Frag(a.begin, pl, a.nullable); } -// Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy) -Frag Compiler::Plus(Frag a, bool nongreedy) { - // a+ is just a* with a different entry point. - Frag f = Star(a, nongreedy); - return Frag(a.begin, f.end); +// Given a fragment for a, returns a fragment for a* or a*? (if nongreedy) +Frag Compiler::Star(Frag a, bool nongreedy) { + // When the subexpression is nullable, one Alt isn't enough to guarantee + // correct priority ordering within the transitive closure. The simplest + // solution is to handle it as (a+)? instead, which adds the second Alt. + if (a.nullable) + return Quest(Plus(a, nongreedy), nongreedy); + + int id = AllocInst(1); + if (id < 0) + return NoMatch(); + PatchList pl; + if (nongreedy) { + inst_[id].InitAlt(0, a.begin); + pl = PatchList::Mk(id << 1); + } else { + inst_[id].InitAlt(a.begin, 0); + pl = PatchList::Mk((id << 1) | 1); + } + PatchList::Patch(inst_.data(), a.end, id); + return Frag(id, pl, true); } // Given a fragment for a, returns a fragment for a? or a?? (if nongreedy) @@ -358,7 +378,7 @@ Frag Compiler::Quest(Frag a, bool nongreedy) { inst_[id].InitAlt(a.begin, 0); pl = PatchList::Mk((id << 1) | 1); } - return Frag(id, PatchList::Append(inst_.data(), pl, a.end)); + return Frag(id, PatchList::Append(inst_.data(), pl, a.end), true); } // Returns a fragment for the byte range lo-hi. @@ -367,7 +387,7 @@ Frag Compiler::ByteRange(int lo, int hi, bool foldcase) { if (id < 0) return NoMatch(); inst_[id].InitByteRange(lo, hi, foldcase, 0); - return Frag(id, PatchList::Mk(id << 1)); + return Frag(id, PatchList::Mk(id << 1), false); } // Returns a no-op fragment. Sometimes unavoidable. @@ -376,7 +396,7 @@ Frag Compiler::Nop() { if (id < 0) return NoMatch(); inst_[id].InitNop(0); - return Frag(id, PatchList::Mk(id << 1)); + return Frag(id, PatchList::Mk(id << 1), true); } // Returns a fragment that signals a match. @@ -385,7 +405,7 @@ Frag Compiler::Match(int32_t match_id) { if (id < 0) return NoMatch(); inst_[id].InitMatch(match_id); - return Frag(id, kNullPatchList); + return Frag(id, kNullPatchList, false); } // Returns a fragment matching a particular empty-width op (like ^ or $) @@ -394,7 +414,7 @@ Frag Compiler::EmptyWidth(EmptyOp empty) { if (id < 0) return NoMatch(); inst_[id].InitEmptyWidth(empty, 0); - return Frag(id, PatchList::Mk(id << 1)); + return Frag(id, PatchList::Mk(id << 1), true); } // Given a fragment a, returns a fragment with capturing parens around a. @@ -408,7 +428,7 @@ Frag Compiler::Capture(Frag a, int n) { inst_[id+1].InitCapture(2*n+1, 0); PatchList::Patch(inst_.data(), a.end, id+1); - return Frag(id, PatchList::Mk((id+1) << 1)); + return Frag(id, PatchList::Mk((id+1) << 1), a.nullable); } // A Rune is a name for a Unicode code point. @@ -567,7 +587,7 @@ bool Compiler::ByteRangeEqual(int id1, int id2) { Frag Compiler::FindByteRange(int root, int id) { if (inst_[root].opcode() == kInstByteRange) { if (ByteRangeEqual(root, id)) - return Frag(root, kNullPatchList); + return Frag(root, kNullPatchList, false); else return NoMatch(); } @@ -575,7 +595,7 @@ Frag Compiler::FindByteRange(int root, int id) { while (inst_[root].opcode() == kInstAlt) { int out1 = inst_[root].out1(); if (ByteRangeEqual(out1, id)) - return Frag(root, PatchList::Mk((root << 1) | 1)); + return Frag(root, PatchList::Mk((root << 1) | 1), false); // CharClass is a sorted list of ranges, so if out1 of the root Alt wasn't // what we're looking for, then we can stop immediately. Unfortunately, we @@ -587,7 +607,7 @@ Frag Compiler::FindByteRange(int root, int id) { if (inst_[out].opcode() == kInstAlt) root = out; else if (ByteRangeEqual(out, id)) - return Frag(root, PatchList::Mk(root << 1)); + return Frag(root, PatchList::Mk(root << 1), false); else return NoMatch(); } diff --git a/re2/testing/compile_test.cc b/re2/testing/compile_test.cc index 2096e2f07..471883078 100644 --- a/re2/testing/compile_test.cc +++ b/re2/testing/compile_test.cc @@ -109,6 +109,20 @@ static Test tests[] = { { "[[-`]", "3. byte [5b-60] 0 -> 4\n" "4. match! 0\n" }, + // Issue 310 + { "(?:|a)*", + "3+ nop -> 7\n" + "4. nop -> 9\n" + "5+ nop -> 7\n" + "6. nop -> 9\n" + "7+ nop -> 5\n" + "8. byte [61-61] 0 -> 5\n" + "9. match! 0\n" }, + { "(?:|a)+", + "3+ nop -> 5\n" + "4. byte [61-61] 0 -> 5\n" + "5+ nop -> 3\n" + "6. match! 0\n" }, }; TEST(TestRegexpCompileToProg, Simple) { @@ -338,27 +352,37 @@ TEST(TestCompile, Bug35237384) { forward); Dump("(a*|b*)*{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL); - EXPECT_EQ("3+ nop -> 6\n" - "4+ nop -> 8\n" - "5. nop -> 21\n" - "6+ byte [61-61] 1 -> 6\n" - "7. nop -> 3\n" - "8+ byte [62-62] 1 -> 8\n" - "9. nop -> 3\n" - "10+ byte [61-61] 1 -> 10\n" - "11. nop -> 21\n" - "12+ byte [62-62] 1 -> 12\n" - "13. nop -> 21\n" - "14+ byte [61-61] 1 -> 14\n" - "15. nop -> 18\n" - "16+ byte [62-62] 1 -> 16\n" - "17. nop -> 18\n" - "18+ nop -> 14\n" - "19+ nop -> 16\n" - "20. match! 0\n" - "21+ nop -> 10\n" - "22+ nop -> 12\n" - "23. nop -> 18\n", + EXPECT_EQ("3+ nop -> 28\n" + "4. nop -> 30\n" + "5+ byte [61-61] 1 -> 5\n" + "6. nop -> 32\n" + "7+ byte [61-61] 1 -> 7\n" + "8. nop -> 26\n" + "9+ byte [61-61] 1 -> 9\n" + "10. nop -> 20\n" + "11+ byte [62-62] 1 -> 11\n" + "12. nop -> 20\n" + "13+ byte [62-62] 1 -> 13\n" + "14. nop -> 26\n" + "15+ byte [62-62] 1 -> 15\n" + "16. nop -> 32\n" + "17+ nop -> 9\n" + "18. nop -> 11\n" + "19. match! 0\n" + "20+ nop -> 17\n" + "21. nop -> 19\n" + "22+ nop -> 7\n" + "23. nop -> 13\n" + "24+ nop -> 17\n" + "25. nop -> 19\n" + "26+ nop -> 22\n" + "27. nop -> 24\n" + "28+ nop -> 5\n" + "29. nop -> 15\n" + "30+ nop -> 22\n" + "31. nop -> 24\n" + "32+ nop -> 28\n" + "33. nop -> 30\n", forward); Dump("((|S.+)+|(|S.+)+|){2}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL); diff --git a/re2/testing/re2_test.cc b/re2/testing/re2_test.cc index 41fccf68e..b1f7d7335 100644 --- a/re2/testing/re2_test.cc +++ b/re2/testing/re2_test.cc @@ -485,37 +485,37 @@ TEST(ProgramFanout, BigProgram) { std::vector histogram; - // 3 is the largest non-empty bucket and has 1 element. + // 3 is the largest non-empty bucket and has 2 element. ASSERT_EQ(3, re1.ProgramFanout(&histogram)); - ASSERT_EQ(1, histogram[3]); + ASSERT_EQ(2, histogram[3]); - // 6 is the largest non-empty bucket and has 10 elements. + // 6 is the largest non-empty bucket and has 11 elements. ASSERT_EQ(6, re10.ProgramFanout(&histogram)); - ASSERT_EQ(10, histogram[6]); + ASSERT_EQ(11, histogram[6]); - // 9 is the largest non-empty bucket and has 100 elements. + // 9 is the largest non-empty bucket and has 101 elements. ASSERT_EQ(9, re100.ProgramFanout(&histogram)); - ASSERT_EQ(100, histogram[9]); + ASSERT_EQ(101, histogram[9]); - // 13 is the largest non-empty bucket and has 1000 elements. + // 13 is the largest non-empty bucket and has 1001 elements. ASSERT_EQ(13, re1000.ProgramFanout(&histogram)); - ASSERT_EQ(1000, histogram[13]); + ASSERT_EQ(1001, histogram[13]); - // 2 is the largest non-empty bucket and has 1 element. + // 2 is the largest non-empty bucket and has 2 element. ASSERT_EQ(2, re1.ReverseProgramFanout(&histogram)); - ASSERT_EQ(1, histogram[2]); + ASSERT_EQ(2, histogram[2]); - // 5 is the largest non-empty bucket and has 10 elements. + // 5 is the largest non-empty bucket and has 11 elements. ASSERT_EQ(5, re10.ReverseProgramFanout(&histogram)); - ASSERT_EQ(10, histogram[5]); + ASSERT_EQ(11, histogram[5]); - // 9 is the largest non-empty bucket and has 100 elements. + // 9 is the largest non-empty bucket and has 101 elements. ASSERT_EQ(9, re100.ReverseProgramFanout(&histogram)); - ASSERT_EQ(100, histogram[9]); + ASSERT_EQ(101, histogram[9]); - // 12 is the largest non-empty bucket and has 1000 elements. + // 12 is the largest non-empty bucket and has 1001 elements. ASSERT_EQ(12, re1000.ReverseProgramFanout(&histogram)); - ASSERT_EQ(1000, histogram[12]); + ASSERT_EQ(1001, histogram[12]); } // Issue 956519: handling empty character sets was @@ -1641,4 +1641,19 @@ TEST(RE2, Issue104) { ASSERT_EQ("小人小类小", s); } +TEST(RE2, Issue310) { + // (?:|a)* matched more text than (?:|a)+ did. + + std::string s = "aaa"; + StringPiece m; + + RE2 star("(?:|a)*"); + ASSERT_TRUE(star.Match(s, 0, s.size(), RE2::UNANCHORED, &m, 1)); + ASSERT_EQ(m, "") << " got m='" << m << "', want ''"; + + RE2 plus("(?:|a)+"); + ASSERT_TRUE(plus.Match(s, 0, s.size(), RE2::UNANCHORED, &m, 1)); + ASSERT_EQ(m, "") << " got m='" << m << "', want ''"; +} + } // namespace re2 diff --git a/re2/testing/search_test.cc b/re2/testing/search_test.cc index c20f501da..5d86dbfa1 100644 --- a/re2/testing/search_test.cc +++ b/re2/testing/search_test.cc @@ -308,6 +308,8 @@ RegexpTest simple_tests[] = { // Former bugs. { "a\\C*|ba\\C", "baba" }, { "\\w*I\\w*", "Inc." }, + { "(?:|a)*", "aaa" }, + { "(?:|a)+", "aaa" }, }; TEST(Regexp, SearchTests) {