Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

remove regex based normalization #14

Merged
merged 4 commits into from
Oct 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 72 additions & 44 deletions normalizer.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,10 @@ type StatementMetadata struct {
Commands []string
}

// groupablePlaceholder carries per-statement state for collapsing a run of
// consecutive obfuscated value placeholders — e.g. "( ?, ?, ? )" — into a
// single placeholder while tokens are streamed through normalizeSQL.
type groupablePlaceholder struct {
	// groupable is true while the normalizer is inside a "(" or "[" group
	// whose elements are obfuscated placeholders; subsequent placeholders
	// and commas in the group are then skipped.
	groupable bool
}

// Normalizer collapses input SQL into a compact normalized form and collects
// statement metadata (tables, comments, commands). Behavior is driven by the
// normalizerConfig built from the options passed to NewNormalizer.
type Normalizer struct {
	config *normalizerConfig
}
Expand All @@ -76,11 +80,6 @@ func NewNormalizer(opts ...normalizerOption) *Normalizer {
return &normalizer
}

const (
ArrayPlaceholder = "( ? )"
BracketPlaceholder = "[ ? ]"
)

// Normalize takes an input SQL string and returns a normalized SQL string, a StatementMetadata struct, and an error.
// The normalizer collapses input SQL into compact format, groups obfuscated values into single placeholder,
// and collects metadata such as table names, comments, and commands.
Expand All @@ -99,26 +98,22 @@ func (n *Normalizer) Normalize(input string, lexerOpts ...lexerOption) (normaliz
}

var lastToken Token // The last token that is not whitespace or comment
var groupablePlaceholder groupablePlaceholder

for _, token := range lexer.ScanAll() {
n.collectMetadata(token, lastToken, statementMetadata)
lastToken = n.normalizeSQL(token, lastToken, &normalizedSQLBuilder)
n.collectMetadata(&token, &lastToken, statementMetadata)
n.normalizeSQL(&token, &lastToken, &normalizedSQLBuilder, &groupablePlaceholder)
}

normalizedSQL = normalizedSQLBuilder.String()

normalizedSQL = groupObfuscatedValues(normalizedSQL)
if !n.config.KeepSQLAlias {
normalizedSQL = discardSQLAlias(normalizedSQL)
}

// Dedupe collected metadata
dedupeStatementMetadata(statementMetadata)

return strings.TrimSpace(normalizedSQL), statementMetadata, nil
}

func (n *Normalizer) collectMetadata(token Token, lastToken Token, statementMetadata *StatementMetadata) {
func (n *Normalizer) collectMetadata(token *Token, lastToken *Token, statementMetadata *StatementMetadata) {
if n.config.CollectComments && (token.Type == COMMENT || token.Type == MULTILINE_COMMENT) {
// Collect comments
statementMetadata.Comments = append(statementMetadata.Comments, token.Value)
Expand All @@ -133,46 +128,79 @@ func (n *Normalizer) collectMetadata(token Token, lastToken Token, statementMeta
}
}

func (n *Normalizer) normalizeSQL(token Token, lastToken Token, normalizedSQLBuilder *strings.Builder) Token {
func (n *Normalizer) normalizeSQL(token *Token, lastToken *Token, normalizedSQLBuilder *strings.Builder, groupablePlaceholder *groupablePlaceholder) {
if token.Type != WS && token.Type != COMMENT && token.Type != MULTILINE_COMMENT {
if !n.config.KeepSQLAlias {
// discard SQL alias
if strings.ToUpper(token.Value) == "AS" {
// if current token is AS, then continue to next token
// because without seeing the next token, we cannot
// determine if the current token is an alias or not
*lastToken = *token
return
}

if strings.ToUpper(lastToken.Value) == "AS" {
if token.Type == IDENT {
// if the last token is AS and the current token is IDENT,
// then the current token is an alias, so we discard it
*lastToken = *token
return
} else {
// if the last token is AS and the current token is not IDENT,
// this could be a CTE like WITH ... AS (...),
// so we do not discard the current token
appendWhitespace(lastToken, token, normalizedSQLBuilder)
n.writeToken(lastToken, normalizedSQLBuilder)
}
}
}

// group consecutive obfuscated values into single placeholder
if n.isObfuscatedValueGroupable(token, lastToken, groupablePlaceholder) {
// return the token but not write it to the normalizedSQLBuilder
*lastToken = *token
return
}

// determine if we should add a whitespace
appendWhitespace(lastToken, token, normalizedSQLBuilder)
if n.config.UppercaseKeywords && isSQLKeyword(token) {
normalizedSQLBuilder.WriteString(strings.ToUpper(token.Value))
} else {
normalizedSQLBuilder.WriteString(token.Value)
}
n.writeToken(token, normalizedSQLBuilder)

lastToken = token
*lastToken = *token
}
}

return lastToken
// writeToken appends the token's value to normalizedSQLBuilder, uppercasing
// it first when keyword uppercasing is enabled and the token is a SQL keyword.
func (n *Normalizer) writeToken(token *Token, normalizedSQLBuilder *strings.Builder) {
	out := token.Value
	if n.config.UppercaseKeywords && isSQLKeyword(token) {
		out = strings.ToUpper(out)
	}
	normalizedSQLBuilder.WriteString(out)
}

// groupObfuscatedValues groups consecutive obfuscated values in a SQL query into a single placeholder.
// It replaces "(?, ?, ...)" and "[?, ?, ...]" with "( ? )" and "[ ? ]", respectively.
// Returns the modified SQL query as a string.
func groupObfuscatedValues(input string) string {
// We use regex to group consecutive obfuscated values into single placeholder.
// This is "less" performant than token by token processing,
// but it is much simpler to implement and maintain.
// The trade off made here is assuming normalization runs on backend
// where performance is not as critical as the agent.
grouped := groupableRegex.ReplaceAllStringFunc(input, func(match string) string {
if match[0] == '(' {
return ArrayPlaceholder
func (n *Normalizer) isObfuscatedValueGroupable(token *Token, lastToken *Token, groupablePlaceholder *groupablePlaceholder) bool {
if token.Value == NumberPlaceholder || token.Value == StringPlaceholder {
if lastToken.Value == "(" || lastToken.Value == "[" {
// if the last token is "(" or "[", and the current token is a placeholder,
// we know it's the start of groupable placeholders
// we don't return here because we still need to write the first placeholder
groupablePlaceholder.groupable = true
} else if lastToken.Value == "," && groupablePlaceholder.groupable {
return true
}
return BracketPlaceholder
})
return grouped
}
}

if (lastToken.Value == NumberPlaceholder || lastToken.Value == StringPlaceholder) && token.Value == "," && groupablePlaceholder.groupable {
return true
}

if groupablePlaceholder.groupable && (token.Value == ")" || token.Value == "]") {
// end of groupable placeholders
groupablePlaceholder.groupable = false
}

// discardSQLAlias removes any SQL alias from the input string and returns the modified string.
// It uses a regular expression to match the alias pattern and replace it with an empty string.
// The function is case-insensitive and matches the pattern "AS <alias_name>".
// The input string is not modified in place.
func discardSQLAlias(input string) string {
return sqlAliasRegex.ReplaceAllString(input, "")
return false
}

func dedupeCollectedMetadata(metadata []string) (dedupedMetadata []string, size int) {
Expand All @@ -198,7 +226,7 @@ func dedupeStatementMetadata(info *StatementMetadata) {
info.Size += tablesSize + commentsSize + commandsSize
}

func appendWhitespace(lastToken Token, token Token, normalizedSQLBuilder *strings.Builder) {
func appendWhitespace(lastToken *Token, token *Token, normalizedSQLBuilder *strings.Builder) {
switch token.Value {
case ",":
case "=":
Expand Down
11 changes: 10 additions & 1 deletion normalizer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -743,11 +743,20 @@ func TestGroupObfuscatedValues(t *testing.T) {
input: "[ ?,?]",
expected: "[ ? ]",
},
{
input: "ANY(?)",
expected: "ANY ( ? )",
},
{
input: "ANY(?, ?)",
expected: "ANY ( ? )",
},
}

for _, test := range tests {
t.Run("", func(t *testing.T) {
got := groupObfuscatedValues(test.input)
normalizer := NewNormalizer()
got, _, _ := normalizer.Normalize(test.input)
assert.Equal(t, test.expected, got)
})
}
Expand Down
12 changes: 4 additions & 8 deletions obfuscate_and_normalize.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,20 +19,16 @@ func ObfuscateAndNormalize(input string, obfuscator *Obfuscator, normalizer *Nor
}

var lastToken Token // The last token that is not whitespace or comment
var groupablePlaceholder groupablePlaceholder

for _, token := range lexer.ScanAll() {
obfuscatedToken := Token{Type: token.Type, Value: obfuscator.ObfuscateTokenValue(token, lexerOpts...)}
normalizer.collectMetadata(obfuscatedToken, lastToken, statementMetadata)
lastToken = normalizer.normalizeSQL(obfuscatedToken, lastToken, &normalizedSQLBuilder)
token.Value = obfuscator.ObfuscateTokenValue(token, lexerOpts...)
normalizer.collectMetadata(&token, &lastToken, statementMetadata)
normalizer.normalizeSQL(&token, &lastToken, &normalizedSQLBuilder, &groupablePlaceholder)
}

normalizedSQL = normalizedSQLBuilder.String()

normalizedSQL = groupObfuscatedValues(normalizedSQL)
if !normalizer.config.KeepSQLAlias {
normalizedSQL = discardSQLAlias(normalizedSQL)
}

// Dedupe collected metadata
dedupeStatementMetadata(statementMetadata)

Expand Down
2 changes: 1 addition & 1 deletion obfuscator.go
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ func (o *Obfuscator) ObfuscateTokenValue(token Token, lexerOpts ...lexerOption)
}

if o.config.ReplaceDigits {
return replaceDigits(token.Value, "?")
return replaceDigits(token.Value, NumberPlaceholder)
} else {
return token.Value
}
Expand Down
108 changes: 98 additions & 10 deletions sqllexer_utils.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package sqllexer

import (
"regexp"
"strings"
"unicode"
)
Expand All @@ -19,7 +18,7 @@ const (
DBMSOracle DBMSType = "oracle"
)

var Commands = map[string]bool{
var commands = map[string]bool{
"SELECT": true,
"INSERT": true,
"UPDATE": true,
Expand Down Expand Up @@ -47,11 +46,96 @@ var tableIndicators = map[string]bool{
"TABLE": true,
}

var keywordsRegex = regexp.MustCompile(`(?i)^(SELECT|INSERT|UPDATE|DELETE|CREATE|ALTER|DROP|GRANT|REVOKE|ADD|ALL|AND|ANY|AS|ASC|BEGIN|BETWEEN|BY|CASE|CHECK|COLUMN|COMMIT|CONSTRAINT|DATABASE|DECLARE|DEFAULT|DESC|DISTINCT|ELSE|END|EXEC|EXISTS|FOREIGN|FROM|GROUP|HAVING|IN|INDEX|INNER|INTO|IS|JOIN|KEY|LEFT|LIKE|LIMIT|NOT|ON|OR|ORDER|OUTER|PRIMARY|PROCEDURE|REPLACE|RETURNS|RIGHT|ROLLBACK|ROWNUM|SET|SOME|TABLE|TOP|TRUNCATE|UNION|UNIQUE|USE|VALUES|VIEW|WHERE|CUBE|ROLLUP|LITERAL|WINDOW|VACCUM|ANALYZE|ILIKE|USING|ASSERTION|DOMAIN|CLUSTER|COPY|EXPLAIN|PLPGSQL|TRIGGER|TEMPORARY|UNLOGGED|RECURSIVE|RETURNING)$`)

var groupableRegex = regexp.MustCompile(`(\()\s*\?(?:\s*,\s*\?\s*)*\s*(\))|(\[)\s*\?(?:\s*,\s*\?\s*)*\s*(\])`)

var sqlAliasRegex = regexp.MustCompile(`(?i)\s+AS\s+[\w?]+`)
// keywords is the set of SQL keywords recognized by isSQLKeyword; identifiers
// whose uppercased value is in this set may be uppercased in the normalized
// output when the UppercaseKeywords option is enabled.
// NOTE(review): "VACCUM" below looks like a typo for "VACUUM" — confirm
// against the intended keyword list before relying on it.
var keywords = map[string]bool{
	"SELECT": true,
	"INSERT": true,
	"UPDATE": true,
	"DELETE": true,
	"CREATE": true,
	"ALTER": true,
	"DROP": true,
	"GRANT": true,
	"REVOKE": true,
	"ADD": true,
	"ALL": true,
	"AND": true,
	"ANY": true,
	"AS": true,
	"ASC": true,
	"BEGIN": true,
	"BETWEEN": true,
	"BY": true,
	"CASE": true,
	"CHECK": true,
	"COLUMN": true,
	"COMMIT": true,
	"CONSTRAINT": true,
	"DATABASE": true,
	"DECLARE": true,
	"DEFAULT": true,
	"DESC": true,
	"DISTINCT": true,
	"ELSE": true,
	"END": true,
	"EXEC": true,
	"EXISTS": true,
	"FOREIGN": true,
	"FROM": true,
	"GROUP": true,
	"HAVING": true,
	"IN": true,
	"INDEX": true,
	"INNER": true,
	"INTO": true,
	"IS": true,
	"JOIN": true,
	"KEY": true,
	"LEFT": true,
	"LIKE": true,
	"LIMIT": true,
	"NOT": true,
	"ON": true,
	"OR": true,
	"ORDER": true,
	"OUTER": true,
	"PRIMARY": true,
	"PROCEDURE": true,
	"REPLACE": true,
	"RETURNS": true,
	"RIGHT": true,
	"ROLLBACK": true,
	"ROWNUM": true,
	"SET": true,
	"SOME": true,
	"TABLE": true,
	"TOP": true,
	"TRUNCATE": true,
	"UNION": true,
	"UNIQUE": true,
	"USE": true,
	"VALUES": true,
	"VIEW": true,
	"WHERE": true,
	"CUBE": true,
	"ROLLUP": true,
	"LITERAL": true,
	"WINDOW": true,
	"VACCUM": true,
	"ANALYZE": true,
	"ILIKE": true,
	"USING": true,
	"ASSERTION": true,
	"DOMAIN": true,
	"CLUSTER": true,
	"COPY": true,
	"EXPLAIN": true,
	"PLPGSQL": true,
	"TRIGGER": true,
	"TEMPORARY": true,
	"UNLOGGED": true,
	"RECURSIVE": true,
	"RETURNING": true,
}

func isWhitespace(ch rune) bool {
return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r'
Expand Down Expand Up @@ -106,7 +190,7 @@ func isEOF(ch rune) bool {
}

func isCommand(ident string) bool {
_, ok := Commands[ident]
_, ok := commands[ident]
return ok
}

Expand All @@ -115,8 +199,12 @@ func isTableIndicator(ident string) bool {
return ok
}

func isSQLKeyword(token Token) bool {
return token.Type == IDENT && keywordsRegex.MatchString(token.Value)
func isSQLKeyword(token *Token) bool {
if token.Type != IDENT {
return false
}
_, ok := keywords[strings.ToUpper(token.Value)]
return ok
}

func isBoolean(ident string) bool {
Expand Down