Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce simple string matcher #2433

Merged
merged 1 commit into from
Jan 20, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
257 changes: 257 additions & 0 deletions libbeat/common/match/cmp.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,257 @@
package match

import "regexp/syntax"

// Common predefined patterns. Every entry is parsed once, when the package
// is initialized, by mustParse (Perl syntax). The helpers in this file use
// several of them as prototypes for structural comparison via eqRegex.
var (
// `.*` on its own and with a begin-of-text / end-of-text anchor.
patDotStar = mustParse(`.*`)
patNullBeginDotStar = mustParse(`^.*`)
patNullEndDotStar = mustParse(`.*$`)

// Empty text, and text made up of whitespace only; compared against by
// isEmptyText and isEmptyTextWithWhitespace respectively.
patEmptyText = mustParse(`^$`)
patEmptyWhiteText = mustParse(`^\s*$`)

// patterns matching any content; isAnyMatch compares against all four.
// patAny1 shares the very same *syntax.Regexp value as patDotStar.
patAny1 = patDotStar
patAny2 = mustParse(`^.*`)
patAny3 = mustParse(`^.*$`)
patAny4 = mustParse(`.*$`)

// Bare begin-of-text and end-of-text anchors.
// NOTE(review): not referenced in this part of the file; presumably used
// elsewhere in the package - confirm.
patBeginText = mustParse(`^`)
patEndText = mustParse(`$`)

// The `\d` digit class; isDigitMatch compares against it.
patDigits = mustParse(`\d`)
)

// isPrefixLiteral reports whether r has the shape `^LITERAL`: a concatenation
// of exactly two terms, a begin-of-text anchor followed by a single literal.
func isPrefixLiteral(r *syntax.Regexp) bool {
	if r.Op != syntax.OpConcat || len(r.Sub) != 2 {
		return false
	}

	anchor, lit := r.Sub[0], r.Sub[1]
	return anchor.Op == syntax.OpBeginText && lit.Op == syntax.OpLiteral
}

// isAltLiterals reports whether r is an alternation in which every branch is
// a plain literal (like `foo|bar`).
func isAltLiterals(r *syntax.Regexp) bool {
	onlyLiterals := r.Op == syntax.OpAlternate
	for i := 0; onlyLiterals && i < len(r.Sub); i++ {
		onlyLiterals = r.Sub[i].Op == syntax.OpLiteral
	}
	return onlyLiterals
}

// isExactLiteral reports whether r has the shape `^LITERAL$`: a concatenation
// of exactly a begin-of-text anchor, one literal and an end-of-text anchor.
func isExactLiteral(r *syntax.Regexp) bool {
	if r.Op != syntax.OpConcat || len(r.Sub) != 3 {
		return false
	}

	shape := [3]syntax.Op{syntax.OpBeginText, syntax.OpLiteral, syntax.OpEndText}
	for i, op := range shape {
		if r.Sub[i].Op != op {
			return false
		}
	}
	return true
}

// isOneOfLiterals reports whether r is a three-term concatenation made of a
// begin-of-text anchor, an alternation of literals (as judged by
// isAltLiterals) and an end-of-text anchor.
func isOneOfLiterals(r *syntax.Regexp) bool {
	if r.Op != syntax.OpConcat || len(r.Sub) != 3 {
		return false
	}

	head, body, tail := r.Sub[0], r.Sub[1], r.Sub[2]
	if head.Op != syntax.OpBeginText {
		return false
	}
	if !isAltLiterals(body) {
		return false
	}
	return tail.Op == syntax.OpEndText
}

// isPrefixAltLiterals reports whether r is a two-term concatenation of a
// begin-of-text anchor followed directly by an alternation whose branches are
// all literals (like `^(?:foo|bar)`).
func isPrefixAltLiterals(r *syntax.Regexp) bool {
	if r.Op != syntax.OpConcat || len(r.Sub) != 2 {
		return false
	}
	if r.Sub[0].Op != syntax.OpBeginText {
		return false
	}

	alt := r.Sub[1]
	if alt.Op != syntax.OpAlternate {
		return false
	}
	for _, branch := range alt.Sub {
		if branch.Op != syntax.OpLiteral {
			return false
		}
	}
	return true
}

// isPrefixNumDate reports whether r describes a numeric, date-like prefix:
// a concatenation that starts with a begin-of-text anchor, optionally
// followed by one literal, then a run of digits, then zero or more pairs of
// (literal separator, run of digits). A term counts as a run of digits when
// isMultiDigits accepts it.
//
// Every access to r.Sub is bounds-checked. Expressions that end right after
// the optional literal (like `^abc`) or right after a separator are reported
// as non-matching instead of triggering an index-out-of-range panic.
func isPrefixNumDate(r *syntax.Regexp) bool {
	if r.Op != syntax.OpConcat || len(r.Sub) == 0 || r.Sub[0].Op != syntax.OpBeginText {
		return false
	}

	i := 1

	// optional literal prefix
	if i < len(r.Sub) && r.Sub[i].Op == syntax.OpLiteral {
		i++
	}

	// mandatory leading digits
	if i >= len(r.Sub) || !isMultiDigits(r.Sub[i]) {
		return false
	}
	i++

	for i < len(r.Sub) {
		// check separator
		if r.Sub[i].Op != syntax.OpLiteral {
			return false
		}
		i++

		// a separator must be followed by another run of digits
		if i >= len(r.Sub) || !isMultiDigits(r.Sub[i]) {
			return false
		}
		i++
	}

	return true
}

// isdotStar checks the term being `.*`.
func isdotStar(r *syntax.Regexp) bool {
return eqRegex(r, patDotStar)
}

func isEmptyText(r *syntax.Regexp) bool {
return eqRegex(r, patEmptyText)
}

func isEmptyTextWithWhitespace(r *syntax.Regexp) bool {
return eqRegex(r, patEmptyWhiteText)
}

func isAnyMatch(r *syntax.Regexp) bool {
return eqRegex(r, patAny1) ||
eqRegex(r, patAny2) ||
eqRegex(r, patAny3) ||
eqRegex(r, patAny4)
}

func isDigitMatch(r *syntax.Regexp) bool {
return eqRegex(r, patDigits)
}

// isMultiDigits reports whether r is a concatenation repeating one shared
// term (see isConcatRepetition) and that term is a digit match (see
// isDigitMatch).
func isMultiDigits(r *syntax.Regexp) bool {
	if !isConcatRepetition(r) {
		return false
	}
	return isDigitMatch(r.Sub[0])
}

// isConcatRepetition reports whether r is a concatenation whose terms are all
// the very same *syntax.Regexp value. Terms are compared by pointer, not by
// structure: two separately parsed but identical terms do not count as a
// repetition.
//
// A concatenation without any terms is reported as false rather than
// indexing into the empty r.Sub slice.
func isConcatRepetition(r *syntax.Regexp) bool {
	if r.Op != syntax.OpConcat || len(r.Sub) == 0 {
		return false
	}

	first := r.Sub[0]
	for _, other := range r.Sub[1:] {
		if other != first { // concat repetitions reuse references => compare pointers
			return false
		}
	}

	return true
}

// eqRegex reports whether r and proto are structurally identical: same
// operator, flags and repetition bounds, the same runes, and pairwise equal
// sub-expressions (checked recursively).
func eqRegex(r, proto *syntax.Regexp) bool {
	switch {
	case r.Op != proto.Op, r.Flags != proto.Flags:
		return false
	case r.Min != proto.Min, r.Max != proto.Max:
		return false
	case len(r.Sub) != len(proto.Sub), len(r.Rune) != len(proto.Rune):
		return false
	}

	for i, sub := range r.Sub {
		if !eqRegex(sub, proto.Sub[i]) {
			return false
		}
	}

	for i, c := range r.Rune {
		if c != proto.Rune[i] {
			return false
		}
	}
	return true
}

// eqPrefixAnyRegex reports whether at least one of protos is a prefix of r
// according to eqPrefixRegex. With no protos the answer is false.
func eqPrefixAnyRegex(r *syntax.Regexp, protos ...*syntax.Regexp) bool {
	for i := 0; i < len(protos); i++ {
		if eqPrefixRegex(r, protos[i]) {
			return true
		}
	}
	return false
}

// eqPrefixRegex reports whether the concatenation r starts with proto.
//
// When proto is itself a concatenation, its terms must equal (per eqRegex)
// the leading terms of r one by one; otherwise proto must equal the first
// term of r. An r that is not a concatenation never matches.
func eqPrefixRegex(r, proto *syntax.Regexp) bool {
	if r.Op != syntax.OpConcat {
		return false
	}

	// Treat a non-concat proto as a one-term sequence so both cases share
	// the comparison loop.
	want := []*syntax.Regexp{proto}
	if proto.Op == syntax.OpConcat {
		want = proto.Sub
	}

	if len(r.Sub) < len(want) {
		return false
	}

	for i, term := range want {
		if !eqRegex(r.Sub[i], term) {
			return false
		}
	}
	return true
}

// eqSuffixAnyRegex reports whether at least one of protos is a suffix of r
// according to eqSuffixRegex. With no protos the answer is false.
func eqSuffixAnyRegex(r *syntax.Regexp, protos ...*syntax.Regexp) bool {
	for i := 0; i < len(protos); i++ {
		if eqSuffixRegex(r, protos[i]) {
			return true
		}
	}
	return false
}

// eqSuffixRegex reports whether the concatenation r ends with proto.
//
// When proto is itself a concatenation, its terms must equal (per eqRegex)
// the trailing terms of r one by one; otherwise proto must equal the last
// term of r. An r that is not a concatenation never matches.
func eqSuffixRegex(r, proto *syntax.Regexp) bool {
	if r.Op != syntax.OpConcat {
		return false
	}

	// Treat a non-concat proto as a one-term sequence so both cases share
	// the comparison loop.
	want := []*syntax.Regexp{proto}
	if proto.Op == syntax.OpConcat {
		want = proto.Sub
	}

	offset := len(r.Sub) - len(want)
	if offset < 0 {
		return false
	}

	tail := r.Sub[offset:]
	for i, term := range want {
		if !eqRegex(tail[i], term) {
			return false
		}
	}
	return true
}

// mustParse parses pattern with the Perl syntax flags and returns the
// resulting expression tree. It panics with the parser's error when the
// pattern is invalid.
func mustParse(pattern string) *syntax.Regexp {
	parsed, err := syntax.Parse(pattern, syntax.Perl)
	if err == nil {
		return parsed
	}
	panic(err)
}
111 changes: 111 additions & 0 deletions libbeat/common/match/compile.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
package match

import (
"regexp"
"regexp/syntax"
)

// compile picks a matcher implementation for the parsed expression r.
//
// The shape checks run in a fixed order and the first one that applies wins:
// bare literal, isExactLiteral, isAltLiterals, isOneOfLiterals,
// isPrefixLiteral, isPrefixAltLiterals, isPrefixNumDate, isEmptyText,
// isEmptyTextWithWhitespace, isAnyMatch. An expression matching none of them
// is rendered back to a string and compiled with regexp.Compile; a compile
// error is returned with a nil matcher.
func compile(r *syntax.Regexp) (stringMatcher, error) {
	// literalBytes converts the runes of every term into a byte slice.
	literalBytes := func(terms []*syntax.Regexp) [][]byte {
		var out [][]byte
		for _, term := range terms {
			out = append(out, []byte(string(term.Rune)))
		}
		return out
	}

	if r.Op == syntax.OpLiteral {
		text := string(r.Rune)
		return &substringMatcher{text, []byte(text)}, nil
	}

	if isExactLiteral(r) {
		text := string(r.Sub[1].Rune)
		return &equalsMatcher{text, []byte(text)}, nil
	}

	if isAltLiterals(r) {
		return &altSubstringMatcher{literalBytes(r.Sub)}, nil
	}

	if isOneOfLiterals(r) {
		return &oneOfMatcher{literalBytes(r.Sub[1].Sub)}, nil
	}

	if isPrefixLiteral(r) {
		return &prefixMatcher{[]byte(string(r.Sub[1].Rune))}, nil
	}

	if isPrefixAltLiterals(r) {
		return &altPrefixMatcher{literalBytes(r.Sub[1].Sub)}, nil
	}

	if isPrefixNumDate(r) {
		return compilePrefixNumDate(r)
	}

	// The next three matchers are returned as nil pointers of their concrete
	// type, wrapped in the interface.
	// NOTE(review): presumably their methods never dereference the receiver -
	// confirm against the type definitions.
	if isEmptyText(r) {
		return (*emptyStringMatcher)(nil), nil
	}

	if isEmptyTextWithWhitespace(r) {
		return (*emptyWhiteStringMatcher)(nil), nil
	}

	if isAnyMatch(r) {
		return (*matchAny)(nil), nil
	}

	// No specialised matcher applies: fall back to the regexp engine.
	re, err := regexp.Compile(r.String())
	if err != nil {
		return nil, err
	}
	return re, nil
}

// compilePrefixNumDate builds a prefixNumDate matcher from r.
//
// It walks r.Sub from index 1: an optional literal becomes the matcher's
// prefix, the next term contributes the first digit count, and every
// following pair of terms contributes a separator and another digit count.
// minLen is the sum of the prefix length, all digit counts and all separator
// lengths. The returned error is always nil.
//
// r.Sub is indexed without bounds checks, so r must already have the layout
// described above; compile only calls this after isPrefixNumDate accepted r.
func compilePrefixNumDate(r *syntax.Regexp) (stringMatcher, error) {
	// countDigits yields the number of terms in a concatenation, or 1 for
	// any other kind of term.
	countDigits := func(term *syntax.Regexp) int {
		if term.Op != syntax.OpConcat {
			return 1
		}
		return len(term.Sub)
	}

	var (
		prefix []byte
		digits []int
		seps   [][]byte
	)

	pos := 1
	if lit := r.Sub[pos]; lit.Op == syntax.OpLiteral {
		prefix = []byte(string(lit.Rune))
		pos++
	}
	total := len(prefix)

	n := countDigits(r.Sub[pos])
	pos++
	digits = append(digits, n)
	total += n

	for pos < len(r.Sub) {
		sep := []byte(string(r.Sub[pos].Rune))
		pos++
		seps = append(seps, sep)
		total += len(sep)

		n := countDigits(r.Sub[pos])
		pos++
		digits = append(digits, n)
		total += n
	}

	return &prefixNumDate{
		prefix: prefix,
		digits: digits,
		seps:   seps,
		minLen: total,
	}, nil
}
Loading