From 708b62d763bd193faf2c59da96c3cf0f73e0a52c Mon Sep 17 00:00:00 2001 From: Zhengda Lu Date: Fri, 13 Oct 2023 16:14:53 -0400 Subject: [PATCH 1/4] remove regex based normalization --- normalizer.go | 104 ++++++++++++++++++++--------------- normalizer_test.go | 3 +- obfuscate_and_normalize.go | 12 ++--- obfuscator.go | 2 +- sqllexer_utils.go | 108 +++++++++++++++++++++++++++++++++---- 5 files changed, 165 insertions(+), 64 deletions(-) diff --git a/normalizer.go b/normalizer.go index ea6baab..47b562e 100644 --- a/normalizer.go +++ b/normalizer.go @@ -60,6 +60,10 @@ type StatementMetadata struct { Commands []string } +type GroupablePlaceholder struct { + groupable bool +} + type Normalizer struct { config *normalizerConfig } @@ -76,11 +80,6 @@ func NewNormalizer(opts ...normalizerOption) *Normalizer { return &normalizer } -const ( - ArrayPlaceholder = "( ? )" - BracketPlaceholder = "[ ? ]" -) - // Normalize takes an input SQL string and returns a normalized SQL string, a StatementMetadata struct, and an error. // The normalizer collapses input SQL into compact format, groups obfuscated values into single placeholder, // and collects metadata such as table names, comments, and commands. @@ -99,26 +98,22 @@ func (n *Normalizer) Normalize(input string, lexerOpts ...lexerOption) (normaliz } var lastToken Token // The last token that is not whitespace or comment + var groupablePlaceholder GroupablePlaceholder for _, token := range lexer.ScanAll() { - n.collectMetadata(token, lastToken, statementMetadata) - lastToken = n.normalizeSQL(token, lastToken, &normalizedSQLBuilder) + n.collectMetadata(&token, &lastToken, statementMetadata) + n.normalizeSQL(&token, &lastToken, &normalizedSQLBuilder, &groupablePlaceholder) } normalizedSQL = normalizedSQLBuilder.String() - normalizedSQL = groupObfuscatedValues(normalizedSQL) - if !n.config.KeepSQLAlias { - normalizedSQL = discardSQLAlias(normalizedSQL) - } - // Dedupe collected metadata dedupeStatementMetadata(statementMetadata) return strings.TrimSpace(normalizedSQL), statementMetadata, nil } -func (n *Normalizer) collectMetadata(token Token, lastToken Token, statementMetadata *StatementMetadata) { +func (n *Normalizer) collectMetadata(token *Token, lastToken *Token, statementMetadata *StatementMetadata) { if n.config.CollectComments && (token.Type == COMMENT || token.Type == MULTILINE_COMMENT) { // Collect comments statementMetadata.Comments = append(statementMetadata.Comments, token.Value) @@ -133,46 +128,67 @@ func (n *Normalizer) collectMetadata(token Token, lastToken Token, statementMeta } } -func (n *Normalizer) normalizeSQL(token Token, lastToken Token, normalizedSQLBuilder *strings.Builder) Token { +func (n *Normalizer) normalizeSQL(token *Token, lastToken *Token, normalizedSQLBuilder *strings.Builder, groupablePlaceholder *GroupablePlaceholder) { if token.Type != WS && token.Type != COMMENT && token.Type != MULTILINE_COMMENT { + if !n.config.KeepSQLAlias { + // discard SQL alias + if strings.ToUpper(token.Value) == "AS" { + *lastToken = *token + return + } + + if strings.ToUpper(lastToken.Value) == "AS" { + if token.Type == IDENT { + *lastToken = *token + return + } else { + appendWhitespace(lastToken, token, normalizedSQLBuilder) + n.writeToken(lastToken, normalizedSQLBuilder) + } + } + } + + // group consecutive obfuscated values into single placeholder + if n.isObfuscatedValueGroupable(token, lastToken, groupablePlaceholder) { + // return the token but not write it to the normalizedSQLBuilder + *lastToken = *token + return + } + // determine if we should add a whitespace appendWhitespace(lastToken, token, normalizedSQLBuilder) - if n.config.UppercaseKeywords && isSQLKeyword(token) { - normalizedSQLBuilder.WriteString(strings.ToUpper(token.Value)) - } else { - normalizedSQLBuilder.WriteString(token.Value) - } + n.writeToken(token, normalizedSQLBuilder) - lastToken = token + *lastToken = *token } +} - return lastToken +func (n *Normalizer) writeToken(token *Token, normalizedSQLBuilder *strings.Builder) { + if n.config.UppercaseKeywords && isSQLKeyword(token) { + normalizedSQLBuilder.WriteString(strings.ToUpper(token.Value)) + } else { + normalizedSQLBuilder.WriteString(token.Value) + } } -// groupObfuscatedValues groups consecutive obfuscated values in a SQL query into a single placeholder. -// It replaces "(?, ?, ...)" and "[?, ?, ...]" with "( ? )" and "[ ? ]", respectively. -// Returns the modified SQL query as a string. -func groupObfuscatedValues(input string) string { - // We use regex to group consecutive obfuscated values into single placeholder. - // This is "less" performant than token by token processing, - // but it is much simpler to implement and maintain. - // The trade off made here is assuming normalization runs on backend - // where performance is not as critical as the agent. - grouped := groupableRegex.ReplaceAllStringFunc(input, func(match string) string { - if match[0] == '(' { - return ArrayPlaceholder +func (n *Normalizer) isObfuscatedValueGroupable(token *Token, lastToken *Token, groupablePlaceholder *GroupablePlaceholder) bool { + if token.Value == NumberPlaceholder || token.Value == StringPlaceholder { + if lastToken.Value == "(" || lastToken.Value == "[" { + groupablePlaceholder.groupable = true + } else if lastToken.Value == "," && groupablePlaceholder.groupable { + return true } - return BracketPlaceholder - }) - return grouped -} + } + + if (lastToken.Value == NumberPlaceholder || lastToken.Value == StringPlaceholder) && token.Value == "," && groupablePlaceholder.groupable { + return true + } + + if groupablePlaceholder.groupable && (token.Value == ")" || token.Value == "]") { + groupablePlaceholder.groupable = false + } -// discardSQLAlias removes any SQL alias from the input string and returns the modified string. -// It uses a regular expression to match the alias pattern and replace it with an empty string. -// The function is case-insensitive and matches the pattern "AS ". -// The input string is not modified in place. -func discardSQLAlias(input string) string { - return sqlAliasRegex.ReplaceAllString(input, "") + return false } func dedupeCollectedMetadata(metadata []string) (dedupedMetadata []string, size int) { @@ -198,7 +214,7 @@ func dedupeStatementMetadata(info *StatementMetadata) { info.Size += tablesSize + commentsSize + commandsSize } -func appendWhitespace(lastToken Token, token Token, normalizedSQLBuilder *strings.Builder) { +func appendWhitespace(lastToken *Token, token *Token, normalizedSQLBuilder *strings.Builder) { switch token.Value { case ",": case "=": diff --git a/normalizer_test.go b/normalizer_test.go index 3740769..695da32 100644 --- a/normalizer_test.go +++ b/normalizer_test.go @@ -747,7 +747,8 @@ func TestGroupObfuscatedValues(t *testing.T) { for _, test := range tests { t.Run("", func(t *testing.T) { - got := groupObfuscatedValues(test.input) + normalizer := NewNormalizer() + got, _, _ := normalizer.Normalize(test.input) assert.Equal(t, test.expected, got) }) } diff --git a/obfuscate_and_normalize.go b/obfuscate_and_normalize.go index 316d764..cc54860 100644 --- a/obfuscate_and_normalize.go +++ b/obfuscate_and_normalize.go @@ -19,20 +19,16 @@ func ObfuscateAndNormalize(input string, obfuscator *Obfuscator, normalizer *Nor } var lastToken Token // The last token that is not whitespace or comment + var groupablePlaceholder GroupablePlaceholder for _, token := range lexer.ScanAll() { - obfuscatedToken := Token{Type: token.Type, Value: obfuscator.ObfuscateTokenValue(token, lexerOpts...)} - normalizer.collectMetadata(obfuscatedToken, lastToken, statementMetadata) - lastToken = normalizer.normalizeSQL(obfuscatedToken, lastToken, &normalizedSQLBuilder) + token.Value = obfuscator.ObfuscateTokenValue(token, lexerOpts...) + normalizer.collectMetadata(&token, &lastToken, statementMetadata) + normalizer.normalizeSQL(&token, &lastToken, &normalizedSQLBuilder, &groupablePlaceholder) } normalizedSQL = normalizedSQLBuilder.String() - normalizedSQL = groupObfuscatedValues(normalizedSQL) - if !normalizer.config.KeepSQLAlias { - normalizedSQL = discardSQLAlias(normalizedSQL) - } - // Dedupe collected metadata dedupeStatementMetadata(statementMetadata) diff --git a/obfuscator.go b/obfuscator.go index ee0e658..daab528 100644 --- a/obfuscator.go +++ b/obfuscator.go @@ -114,7 +114,7 @@ func (o *Obfuscator) ObfuscateTokenValue(token Token, lexerOpts ...lexerOption) } if o.config.ReplaceDigits { - return replaceDigits(token.Value, "?") + return replaceDigits(token.Value, NumberPlaceholder) } else { return token.Value } diff --git a/sqllexer_utils.go b/sqllexer_utils.go index e57de87..e14502c 100644 --- a/sqllexer_utils.go +++ b/sqllexer_utils.go @@ -1,7 +1,6 @@ package sqllexer import ( - "regexp" "strings" "unicode" ) @@ -19,7 +18,7 @@ const ( DBMSOracle DBMSType = "oracle" ) -var Commands = map[string]bool{ +var commands = map[string]bool{ "SELECT": true, "INSERT": true, "UPDATE": true, @@ -47,11 +46,96 @@ var tableIndicators = map[string]bool{ "TABLE": true, } -var keywordsRegex = regexp.MustCompile(`(?i)^(SELECT|INSERT|UPDATE|DELETE|CREATE|ALTER|DROP|GRANT|REVOKE|ADD|ALL|AND|ANY|AS|ASC|BEGIN|BETWEEN|BY|CASE|CHECK|COLUMN|COMMIT|CONSTRAINT|DATABASE|DECLARE|DEFAULT|DESC|DISTINCT|ELSE|END|EXEC|EXISTS|FOREIGN|FROM|GROUP|HAVING|IN|INDEX|INNER|INTO|IS|JOIN|KEY|LEFT|LIKE|LIMIT|NOT|ON|OR|ORDER|OUTER|PRIMARY|PROCEDURE|REPLACE|RETURNS|RIGHT|ROLLBACK|ROWNUM|SET|SOME|TABLE|TOP|TRUNCATE|UNION|UNIQUE|USE|VALUES|VIEW|WHERE|CUBE|ROLLUP|LITERAL|WINDOW|VACCUM|ANALYZE|ILIKE|USING|ASSERTION|DOMAIN|CLUSTER|COPY|EXPLAIN|PLPGSQL|TRIGGER|TEMPORARY|UNLOGGED|RECURSIVE|RETURNING)$`) - -var groupableRegex = regexp.MustCompile(`(\()\s*\?(?:\s*,\s*\?\s*)*\s*(\))|(\[)\s*\?(?:\s*,\s*\?\s*)*\s*(\])`) - -var sqlAliasRegex = regexp.MustCompile(`(?i)\s+AS\s+[\w?]+`) +var keywords = map[string]bool{ + "SELECT": true, + "INSERT": true, + "UPDATE": true, + "DELETE": true, + "CREATE": true, + "ALTER": true, + "DROP": true, + "GRANT": true, + "REVOKE": true, + "ADD": true, + "ALL": true, + "AND": true, + "ANY": true, + "AS": true, + "ASC": true, + "BEGIN": true, + "BETWEEN": true, + "BY": true, + "CASE": true, + "CHECK": true, + "COLUMN": true, + "COMMIT": true, + "CONSTRAINT": true, + "DATABASE": true, + "DECLARE": true, + "DEFAULT": true, + "DESC": true, + "DISTINCT": true, + "ELSE": true, + "END": true, + "EXEC": true, + "EXISTS": true, + "FOREIGN": true, + "FROM": true, + "GROUP": true, + "HAVING": true, + "IN": true, + "INDEX": true, + "INNER": true, + "INTO": true, + "IS": true, + "JOIN": true, + "KEY": true, + "LEFT": true, + "LIKE": true, + "LIMIT": true, + "NOT": true, + "ON": true, + "OR": true, + "ORDER": true, + "OUTER": true, + "PRIMARY": true, + "PROCEDURE": true, + "REPLACE": true, + "RETURNS": true, + "RIGHT": true, + "ROLLBACK": true, + "ROWNUM": true, + "SET": true, + "SOME": true, + "TABLE": true, + "TOP": true, + "TRUNCATE": true, + "UNION": true, + "UNIQUE": true, + "USE": true, + "VALUES": true, + "VIEW": true, + "WHERE": true, + "CUBE": true, + "ROLLUP": true, + "LITERAL": true, + "WINDOW": true, + "VACCUM": true, + "ANALYZE": true, + "ILIKE": true, + "USING": true, + "ASSERTION": true, + "DOMAIN": true, + "CLUSTER": true, + "COPY": true, + "EXPLAIN": true, + "PLPGSQL": true, + "TRIGGER": true, + "TEMPORARY": true, + "UNLOGGED": true, + "RECURSIVE": true, + "RETURNING": true, +} func isWhitespace(ch rune) bool { return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' @@ -106,7 +190,7 @@ func isEOF(ch rune) bool { } func isCommand(ident string) bool { - _, ok := Commands[ident] + _, ok := commands[ident] return ok } @@ -115,8 +199,12 @@ func isTableIndicator(ident string) bool { return ok } -func isSQLKeyword(token Token) bool { - return token.Type == IDENT && keywordsRegex.MatchString(token.Value) +func isSQLKeyword(token *Token) bool { + if token.Type != IDENT { + return false + } + _, ok := keywords[strings.ToUpper(token.Value)] + return ok } func isBoolean(ident string) bool { From 7e63af8d046cf6dac3850956e52b68c124ecebdb Mon Sep 17 00:00:00 2001 From: Zhengda Lu Date: Fri, 13 Oct 2023 16:24:52 -0400 Subject: [PATCH 2/4] add comments --- normalizer.go | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/normalizer.go b/normalizer.go index 47b562e..a2e805e 100644 --- a/normalizer.go +++ b/normalizer.go @@ -133,15 +133,23 @@ func (n *Normalizer) normalizeSQL(token *Token, lastToken *Token, normalizedSQLB if !n.config.KeepSQLAlias { // discard SQL alias if strings.ToUpper(token.Value) == "AS" { + // if current token is AS, then continue to next token + // because without seeing the next token, we cannot + // determine if the current token is an alias or not *lastToken = *token return } if strings.ToUpper(lastToken.Value) == "AS" { if token.Type == IDENT { + // if the last token is AS and the current token is IDENT, + // then the current token is an alias, so we discard it *lastToken = *token return } else { + // if the last token is AS and the current token is not IDENT, + // this could be a CTE like WITH ... AS (...), + // so we do not discard the current token appendWhitespace(lastToken, token, normalizedSQLBuilder) n.writeToken(lastToken, normalizedSQLBuilder) } @@ -174,6 +182,9 @@ func (n *Normalizer) writeToken(token *Token, normalizedSQLBuilder *strings.Buil func (n *Normalizer) isObfuscatedValueGroupable(token *Token, lastToken *Token, groupablePlaceholder *GroupablePlaceholder) bool { if token.Value == NumberPlaceholder || token.Value == StringPlaceholder { if lastToken.Value == "(" || lastToken.Value == "[" { + // if the last token is "(" or "[", and the current token is a placeholder, + // we know it's the start of groupable placeholders + // we don't return here because we still need to write the first placeholder groupablePlaceholder.groupable = true } else if lastToken.Value == "," && groupablePlaceholder.groupable { return true @@ -185,6 +196,7 @@ func (n *Normalizer) isObfuscatedValueGroupable(token *Token, lastToken *Token, } if groupablePlaceholder.groupable && (token.Value == ")" || token.Value == "]") { + // end of groupable placeholders groupablePlaceholder.groupable = false } From 1b11a0e362f9618005b96d02213c9723f97a5625 Mon Sep 17 00:00:00 2001 From: Zhengda Lu Date: Mon, 16 Oct 2023 11:04:56 -0400 Subject: [PATCH 3/4] unit test to cover postgres ANY keyword --- normalizer_test.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/normalizer_test.go b/normalizer_test.go index 695da32..b8c619d 100644 --- a/normalizer_test.go +++ b/normalizer_test.go @@ -743,6 +743,14 @@ func TestGroupObfuscatedValues(t *testing.T) { input: "[ ?,?]", expected: "[ ? ]", }, + { + input: "ANY(?)", + expected: "ANY ( ? )", + }, + { + input: "ANY(?, ?)", + expected: "ANY ( ? )", + }, } for _, test := range tests { From 5bf54fbc88d08552ae73c55096442d1cb4e05145 Mon Sep 17 00:00:00 2001 From: Zhengda Lu Date: Tue, 17 Oct 2023 13:55:07 -0400 Subject: [PATCH 4/4] make groupablePlaceholder non exportable --- normalizer.go | 8 ++++---- obfuscate_and_normalize.go | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/normalizer.go b/normalizer.go index a2e805e..bfa87ae 100644 --- a/normalizer.go +++ b/normalizer.go @@ -60,7 +60,7 @@ type StatementMetadata struct { Commands []string } -type GroupablePlaceholder struct { +type groupablePlaceholder struct { groupable bool } @@ -98,7 +98,7 @@ func (n *Normalizer) Normalize(input string, lexerOpts ...lexerOption) (normaliz } var lastToken Token // The last token that is not whitespace or comment - var groupablePlaceholder GroupablePlaceholder + var groupablePlaceholder groupablePlaceholder for _, token := range lexer.ScanAll() { n.collectMetadata(&token, &lastToken, statementMetadata) @@ -128,7 +128,7 @@ func (n *Normalizer) collectMetadata(token *Token, lastToken *Token, statementMe } } -func (n *Normalizer) normalizeSQL(token *Token, lastToken *Token, normalizedSQLBuilder *strings.Builder, groupablePlaceholder *GroupablePlaceholder) { +func (n *Normalizer) normalizeSQL(token *Token, lastToken *Token, normalizedSQLBuilder *strings.Builder, groupablePlaceholder *groupablePlaceholder) { if token.Type != WS && token.Type != COMMENT && token.Type != MULTILINE_COMMENT { if !n.config.KeepSQLAlias { // discard SQL alias @@ -179,7 +179,7 @@ func (n *Normalizer) writeToken(token *Token, normalizedSQLBuilder *strings.Buil } } -func (n *Normalizer) isObfuscatedValueGroupable(token *Token, lastToken *Token, groupablePlaceholder *GroupablePlaceholder) bool { +func (n *Normalizer) isObfuscatedValueGroupable(token *Token, lastToken *Token, groupablePlaceholder *groupablePlaceholder) bool { if token.Value == NumberPlaceholder || token.Value == StringPlaceholder { if lastToken.Value == "(" || lastToken.Value == "[" { // if the last token is "(" or "[", and the current token is a placeholder, diff --git a/obfuscate_and_normalize.go b/obfuscate_and_normalize.go index cc54860..9d4798e 100644 --- a/obfuscate_and_normalize.go +++ b/obfuscate_and_normalize.go @@ -19,7 +19,7 @@ func ObfuscateAndNormalize(input string, obfuscator *Obfuscator, normalizer *Nor } var lastToken Token // The last token that is not whitespace or comment - var groupablePlaceholder GroupablePlaceholder + var groupablePlaceholder groupablePlaceholder for _, token := range lexer.ScanAll() { token.Value = obfuscator.ObfuscateTokenValue(token, lexerOpts...)