-
Notifications
You must be signed in to change notification settings - Fork 4.9k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Provide match.Matcher and match.ExactMatcher, which use regular expressions for the matching use-case only. The matchers compile a regular expression into a Matcher, which only provides the Match functionality. This gives us a chance to optimize/replace some common cases used for matching: - replace capture-groups by non-capturing groups - remove leading/trailing `.*` expressions (Match already searches for a sub-string matching the regex) - replace simple literal searches with `==`, `strings.Contains` and `strings.HasPrefix` - replace regexes for alternative literals (e.g. `DEBUG|INFO|ERROR`) with strings.Contains over the set of literals - optimized empty-line checks. If the input regular expression cannot be reduced to a simple case, regexp.Regexp will be used. The `ExactMatcher` will embed `<regex>` into `^<regex>$` by default. Note: Matcher currently does not split simple cases, e.g. `abc.*def` or `abc.def` will still fall back to regexp.Regexp.
- Loading branch information
urso
committed
Jan 12, 2017
1 parent
19e82d5
commit c683077
Showing
7 changed files
with
1,411 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,257 @@ | ||
package match | ||
|
||
import "regexp/syntax" | ||
|
||
// common predefined patterns | ||
var ( | ||
patDotStar = mustParse(`.*`) | ||
patNullBeginDotStar = mustParse(`^.*`) | ||
patNullEndDotStar = mustParse(`.*$`) | ||
|
||
patEmptyText = mustParse(`^$`) | ||
patEmptyWhiteText = mustParse(`^\s*$`) | ||
|
||
// patterns matching any content | ||
patAny1 = patDotStar | ||
patAny2 = mustParse(`^.*`) | ||
patAny3 = mustParse(`^.*$`) | ||
patAny4 = mustParse(`.*$`) | ||
|
||
patBeginText = mustParse(`^`) | ||
patEndText = mustParse(`$`) | ||
|
||
patDigits = mustParse(`\d`) | ||
) | ||
|
||
// isPrefixLiteral reports whether r has the shape `^LITERAL`, i.e. the
// expression only checks that the input starts with a fixed string.
//
// Literals flagged with syntax.FoldCase (written with `(?i)`) are rejected:
// a plain prefix comparison is case-sensitive and would not accept the same
// inputs as the regular expression.
func isPrefixLiteral(r *syntax.Regexp) bool {
	if r.Op != syntax.OpConcat || len(r.Sub) != 2 {
		return false
	}

	anchor, lit := r.Sub[0], r.Sub[1]
	return anchor.Op == syntax.OpBeginText &&
		lit.Op == syntax.OpLiteral &&
		lit.Flags&syntax.FoldCase == 0
}
|
||
// isAltLiterals reports whether r is an alternation made up exclusively of
// case-sensitive literals (like `DEBUG|INFO|ERROR`).
//
// Alternatives flagged with syntax.FoldCase (written with `(?i)`) are
// rejected, because comparing the literal text verbatim would not honor the
// case-insensitive match.
func isAltLiterals(r *syntax.Regexp) bool {
	if r.Op != syntax.OpAlternate {
		return false
	}

	for _, sub := range r.Sub {
		if sub.Op != syntax.OpLiteral || sub.Flags&syntax.FoldCase != 0 {
			return false
		}
	}
	return true
}
|
||
// isExactLiteral reports whether r has the shape `^LITERAL$`, i.e. the whole
// input must equal a fixed string.
//
// Literals flagged with syntax.FoldCase (written with `(?i)`) are rejected:
// an equality comparison is case-sensitive and would not accept the same
// inputs as the regular expression.
func isExactLiteral(r *syntax.Regexp) bool {
	if r.Op != syntax.OpConcat || len(r.Sub) != 3 {
		return false
	}

	begin, lit, end := r.Sub[0], r.Sub[1], r.Sub[2]
	return begin.Op == syntax.OpBeginText &&
		lit.Op == syntax.OpLiteral &&
		lit.Flags&syntax.FoldCase == 0 &&
		end.Op == syntax.OpEndText
}
|
||
func isOneOfLiterals(r *syntax.Regexp) bool { | ||
return r.Op == syntax.OpConcat && | ||
len(r.Sub) == 3 && | ||
r.Sub[0].Op == syntax.OpBeginText && | ||
isAltLiterals(r.Sub[1]) && | ||
r.Sub[2].Op == syntax.OpEndText | ||
} | ||
|
||
// isPrefixAltLiterals reports whether r has the shape `^(?:LIT1|LIT2|...)`,
// i.e. the input must start with one of a fixed set of strings.
//
// Alternatives flagged with syntax.FoldCase (written with `(?i)`) are
// rejected, because a verbatim prefix comparison is case-sensitive and would
// not accept the same inputs as the regular expression.
func isPrefixAltLiterals(r *syntax.Regexp) bool {
	if r.Op != syntax.OpConcat || len(r.Sub) != 2 {
		return false
	}
	if r.Sub[0].Op != syntax.OpBeginText || r.Sub[1].Op != syntax.OpAlternate {
		return false
	}

	for _, alt := range r.Sub[1].Sub {
		if alt.Op != syntax.OpLiteral || alt.Flags&syntax.FoldCase != 0 {
			return false
		}
	}
	return true
}
|
||
func isPrefixNumDate(r *syntax.Regexp) bool { | ||
if r.Op != syntax.OpConcat || r.Sub[0].Op != syntax.OpBeginText { | ||
return false | ||
} | ||
|
||
i := 1 | ||
if r.Sub[i].Op == syntax.OpLiteral { | ||
i++ | ||
} | ||
|
||
// check digits | ||
if !isMultiDigits(r.Sub[i]) { | ||
return false | ||
} | ||
i++ | ||
|
||
for i < len(r.Sub) { | ||
// check separator | ||
if r.Sub[i].Op != syntax.OpLiteral { | ||
return false | ||
} | ||
i++ | ||
|
||
// check digits | ||
if !isMultiDigits(r.Sub[i]) { | ||
return false | ||
} | ||
i++ | ||
} | ||
|
||
return true | ||
} | ||
|
||
// isdotStar checks the term being `.*`. | ||
func isdotStar(r *syntax.Regexp) bool { | ||
return eqRegex(r, patDotStar) | ||
} | ||
|
||
func isEmptyText(r *syntax.Regexp) bool { | ||
return eqRegex(r, patEmptyText) | ||
} | ||
|
||
func isEmptyTextWithWhitespace(r *syntax.Regexp) bool { | ||
return eqRegex(r, patEmptyWhiteText) | ||
} | ||
|
||
func isAnyMatch(r *syntax.Regexp) bool { | ||
return eqRegex(r, patAny1) || | ||
eqRegex(r, patAny2) || | ||
eqRegex(r, patAny3) || | ||
eqRegex(r, patAny4) | ||
} | ||
|
||
func isDigitMatch(r *syntax.Regexp) bool { | ||
return eqRegex(r, patDigits) | ||
} | ||
|
||
func isMultiDigits(r *syntax.Regexp) bool { | ||
return isConcatRepetition(r) && isDigitMatch(r.Sub[0]) | ||
} | ||
|
||
// isConcatRepetition reports whether r is a concatenation in which every
// term is the very same *syntax.Regexp node. Terms are compared by pointer,
// not by structure, because concat repetitions reuse one node reference.
//
// A concat node without any sub-expression is not a repetition and yields
// false (rather than panicking on the missing first term).
func isConcatRepetition(r *syntax.Regexp) bool {
	if r.Op != syntax.OpConcat || len(r.Sub) == 0 {
		return false
	}

	first := r.Sub[0]
	for _, other := range r.Sub[1:] {
		if other != first { // pointer comparison on purpose
			return false
		}
	}
	return true
}
|
||
// eqRegex reports whether r and proto are structurally identical: same
// operator, flags, repetition bounds, runes, and (recursively) the same
// sub-expressions in the same order.
func eqRegex(r, proto *syntax.Regexp) bool {
	switch {
	case r.Op != proto.Op, r.Flags != proto.Flags:
		return false
	case r.Min != proto.Min, r.Max != proto.Max:
		return false
	case len(r.Sub) != len(proto.Sub), len(r.Rune) != len(proto.Rune):
		return false
	}

	for i, sub := range r.Sub {
		if !eqRegex(sub, proto.Sub[i]) {
			return false
		}
	}

	for i, c := range r.Rune {
		if c != proto.Rune[i] {
			return false
		}
	}
	return true
}
|
||
func eqPrefixAnyRegex(r *syntax.Regexp, protos ...*syntax.Regexp) bool { | ||
for _, proto := range protos { | ||
if eqPrefixRegex(r, proto) { | ||
return true | ||
} | ||
} | ||
return false | ||
} | ||
|
||
func eqPrefixRegex(r, proto *syntax.Regexp) bool { | ||
if r.Op != syntax.OpConcat { | ||
return false | ||
} | ||
|
||
if proto.Op != syntax.OpConcat { | ||
if len(r.Sub) == 0 { | ||
return false | ||
} | ||
return eqRegex(r.Sub[0], proto) | ||
} | ||
|
||
if len(r.Sub) < len(proto.Sub) { | ||
return false | ||
} | ||
|
||
for i := range proto.Sub { | ||
if !eqRegex(r.Sub[i], proto.Sub[i]) { | ||
return false | ||
} | ||
} | ||
return true | ||
} | ||
|
||
func eqSuffixAnyRegex(r *syntax.Regexp, protos ...*syntax.Regexp) bool { | ||
for _, proto := range protos { | ||
if eqSuffixRegex(r, proto) { | ||
return true | ||
} | ||
} | ||
return false | ||
} | ||
|
||
func eqSuffixRegex(r, proto *syntax.Regexp) bool { | ||
if r.Op != syntax.OpConcat { | ||
return false | ||
} | ||
|
||
if proto.Op != syntax.OpConcat { | ||
i := len(r.Sub) - 1 | ||
if i < 0 { | ||
return false | ||
} | ||
return eqRegex(r.Sub[i], proto) | ||
} | ||
|
||
if len(r.Sub) < len(proto.Sub) { | ||
return false | ||
} | ||
|
||
d := len(r.Sub) - len(proto.Sub) | ||
for i := range proto.Sub { | ||
if !eqRegex(r.Sub[d+i], proto.Sub[i]) { | ||
return false | ||
} | ||
} | ||
return true | ||
} | ||
|
||
// mustParse parses pattern with Perl syntax flags and returns the resulting
// syntax tree. It panics with the parse error if the pattern is invalid.
func mustParse(pattern string) *syntax.Regexp {
	parsed, err := syntax.Parse(pattern, syntax.Perl)
	if err == nil {
		return parsed
	}
	panic(err)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
package match | ||
|
||
import ( | ||
"regexp" | ||
"regexp/syntax" | ||
) | ||
|
||
func compile(r *syntax.Regexp) (stringMatcher, error) { | ||
switch { | ||
case r.Op == syntax.OpLiteral: | ||
s := string(r.Rune) | ||
return &substringMatcher{s, []byte(s)}, nil | ||
|
||
case isExactLiteral(r): | ||
s := string(r.Sub[1].Rune) | ||
return &equalsMatcher{s, []byte(s)}, nil | ||
|
||
case isAltLiterals(r): | ||
var literals [][]byte | ||
for _, sub := range r.Sub { | ||
literals = append(literals, []byte(string(sub.Rune))) | ||
} | ||
return &altSubstringMatcher{literals}, nil | ||
|
||
case isOneOfLiterals(r): | ||
var literals [][]byte | ||
for _, sub := range r.Sub[1].Sub { | ||
literals = append(literals, []byte(string(sub.Rune))) | ||
} | ||
return &oneOfMatcher{literals}, nil | ||
|
||
case isPrefixLiteral(r): | ||
s := []byte(string(r.Sub[1].Rune)) | ||
return &prefixMatcher{s}, nil | ||
|
||
case isPrefixAltLiterals(r): | ||
var literals [][]byte | ||
for _, sub := range r.Sub[1].Sub { | ||
literals = append(literals, []byte(string(sub.Rune))) | ||
} | ||
return &altPrefixMatcher{literals}, nil | ||
|
||
case isPrefixNumDate(r): | ||
return compilePrefixNumDate(r) | ||
|
||
case isEmptyText(r): | ||
var m *emptyStringMatcher | ||
return m, nil | ||
|
||
case isEmptyTextWithWhitespace(r): | ||
var m *emptyWhiteStringMatcher | ||
return m, nil | ||
|
||
case isAnyMatch(r): | ||
var m *matchAny | ||
return m, nil | ||
|
||
default: | ||
|
||
r, err := regexp.Compile(r.String()) | ||
if err != nil { | ||
return nil, err | ||
} | ||
return r, nil | ||
} | ||
} | ||
|
||
func compilePrefixNumDate(r *syntax.Regexp) (stringMatcher, error) { | ||
m := &prefixNumDate{} | ||
|
||
i := 1 | ||
if r.Sub[i].Op == syntax.OpLiteral { | ||
m.prefix = []byte(string(r.Sub[i].Rune)) | ||
i++ | ||
} | ||
|
||
digitLen := func(r *syntax.Regexp) int { | ||
if r.Op == syntax.OpConcat { | ||
return len(r.Sub) | ||
} | ||
return 1 | ||
} | ||
|
||
var digits []int | ||
var seps [][]byte | ||
|
||
digits = append(digits, digitLen(r.Sub[i])) | ||
i++ | ||
|
||
for i < len(r.Sub) { | ||
seps = append(seps, []byte(string(r.Sub[i].Rune))) | ||
i++ | ||
|
||
digits = append(digits, digitLen(r.Sub[i])) | ||
i++ | ||
} | ||
|
||
minLen := len(m.prefix) | ||
for _, d := range digits { | ||
minLen += d | ||
} | ||
for _, sep := range seps { | ||
minLen += len(sep) | ||
} | ||
|
||
m.digits = digits | ||
m.seps = seps | ||
m.minLen = minLen | ||
|
||
return m, nil | ||
} |
Oops, something went wrong.