From 708b62d763bd193faf2c59da96c3cf0f73e0a52c Mon Sep 17 00:00:00 2001
From: Zhengda Lu <zhengda.lu@datadoghq.com>
Date: Fri, 13 Oct 2023 16:14:53 -0400
Subject: [PATCH 1/4] remove regex based normalization

---
 normalizer.go              | 104 ++++++++++++++++++++---------------
 normalizer_test.go         |   3 +-
 obfuscate_and_normalize.go |  12 ++---
 obfuscator.go              |   2 +-
 sqllexer_utils.go          | 108 +++++++++++++++++++++++++++++++++----
 5 files changed, 165 insertions(+), 64 deletions(-)

diff --git a/normalizer.go b/normalizer.go
index ea6baab..47b562e 100644
--- a/normalizer.go
+++ b/normalizer.go
@@ -60,6 +60,10 @@ type StatementMetadata struct {
 	Commands []string
 }
 
+type GroupablePlaceholder struct {
+	groupable bool
+}
+
 type Normalizer struct {
 	config *normalizerConfig
 }
@@ -76,11 +80,6 @@ func NewNormalizer(opts ...normalizerOption) *Normalizer {
 	return &normalizer
 }
 
-const (
-	ArrayPlaceholder   = "( ? )"
-	BracketPlaceholder = "[ ? ]"
-)
-
 // Normalize takes an input SQL string and returns a normalized SQL string, a StatementMetadata struct, and an error.
 // The normalizer collapses input SQL into compact format, groups obfuscated values into single placeholder,
 // and collects metadata such as table names, comments, and commands.
@@ -99,26 +98,22 @@ func (n *Normalizer) Normalize(input string, lexerOpts ...lexerOption) (normaliz
 	}
 
 	var lastToken Token // The last token that is not whitespace or comment
+	var groupablePlaceholder GroupablePlaceholder
 
 	for _, token := range lexer.ScanAll() {
-		n.collectMetadata(token, lastToken, statementMetadata)
-		lastToken = n.normalizeSQL(token, lastToken, &normalizedSQLBuilder)
+		n.collectMetadata(&token, &lastToken, statementMetadata)
+		n.normalizeSQL(&token, &lastToken, &normalizedSQLBuilder, &groupablePlaceholder)
 	}
 
 	normalizedSQL = normalizedSQLBuilder.String()
 
-	normalizedSQL = groupObfuscatedValues(normalizedSQL)
-	if !n.config.KeepSQLAlias {
-		normalizedSQL = discardSQLAlias(normalizedSQL)
-	}
-
 	// Dedupe collected metadata
 	dedupeStatementMetadata(statementMetadata)
 
 	return strings.TrimSpace(normalizedSQL), statementMetadata, nil
 }
 
-func (n *Normalizer) collectMetadata(token Token, lastToken Token, statementMetadata *StatementMetadata) {
+func (n *Normalizer) collectMetadata(token *Token, lastToken *Token, statementMetadata *StatementMetadata) {
 	if n.config.CollectComments && (token.Type == COMMENT || token.Type == MULTILINE_COMMENT) {
 		// Collect comments
 		statementMetadata.Comments = append(statementMetadata.Comments, token.Value)
@@ -133,46 +128,67 @@ func (n *Normalizer) collectMetadata(token Token, lastToken Token, statementMeta
 	}
 }
 
-func (n *Normalizer) normalizeSQL(token Token, lastToken Token, normalizedSQLBuilder *strings.Builder) Token {
+func (n *Normalizer) normalizeSQL(token *Token, lastToken *Token, normalizedSQLBuilder *strings.Builder, groupablePlaceholder *GroupablePlaceholder) {
 	if token.Type != WS && token.Type != COMMENT && token.Type != MULTILINE_COMMENT {
+		if !n.config.KeepSQLAlias {
+			// discard SQL alias
+			if strings.ToUpper(token.Value) == "AS" {
+				*lastToken = *token
+				return
+			}
+
+			if strings.ToUpper(lastToken.Value) == "AS" {
+				if token.Type == IDENT {
+					*lastToken = *token
+					return
+				} else {
+					appendWhitespace(lastToken, token, normalizedSQLBuilder)
+					n.writeToken(lastToken, normalizedSQLBuilder)
+				}
+			}
+		}
+
+		// group consecutive obfuscated values into single placeholder
+		if n.isObfuscatedValueGroupable(token, lastToken, groupablePlaceholder) {
+			// record the token as lastToken but do not write it to the normalizedSQLBuilder
+			*lastToken = *token
+			return
+		}
+
 		// determine if we should add a whitespace
 		appendWhitespace(lastToken, token, normalizedSQLBuilder)
-		if n.config.UppercaseKeywords && isSQLKeyword(token) {
-			normalizedSQLBuilder.WriteString(strings.ToUpper(token.Value))
-		} else {
-			normalizedSQLBuilder.WriteString(token.Value)
-		}
+		n.writeToken(token, normalizedSQLBuilder)
 
-		lastToken = token
+		*lastToken = *token
 	}
+}
 
-	return lastToken
+func (n *Normalizer) writeToken(token *Token, normalizedSQLBuilder *strings.Builder) {
+	if n.config.UppercaseKeywords && isSQLKeyword(token) {
+		normalizedSQLBuilder.WriteString(strings.ToUpper(token.Value))
+	} else {
+		normalizedSQLBuilder.WriteString(token.Value)
+	}
 }
 
-// groupObfuscatedValues groups consecutive obfuscated values in a SQL query into a single placeholder.
-// It replaces "(?, ?, ...)" and "[?, ?, ...]" with "( ? )" and "[ ? ]", respectively.
-// Returns the modified SQL query as a string.
-func groupObfuscatedValues(input string) string {
-	// We use regex to group consecutive obfuscated values into single placeholder.
-	// This is "less" performant than token by token processing,
-	// but it is much simpler to implement and maintain.
-	// The trade off made here is assuming normalization runs on backend
-	// where performance is not as critical as the agent.
-	grouped := groupableRegex.ReplaceAllStringFunc(input, func(match string) string {
-		if match[0] == '(' {
-			return ArrayPlaceholder
+func (n *Normalizer) isObfuscatedValueGroupable(token *Token, lastToken *Token, groupablePlaceholder *GroupablePlaceholder) bool {
+	if token.Value == NumberPlaceholder || token.Value == StringPlaceholder {
+		if lastToken.Value == "(" || lastToken.Value == "[" {
+			groupablePlaceholder.groupable = true
+		} else if lastToken.Value == "," && groupablePlaceholder.groupable {
+			return true
 		}
-		return BracketPlaceholder
-	})
-	return grouped
-}
+	}
+
+	if (lastToken.Value == NumberPlaceholder || lastToken.Value == StringPlaceholder) && token.Value == "," && groupablePlaceholder.groupable {
+		return true
+	}
+
+	if groupablePlaceholder.groupable && (token.Value == ")" || token.Value == "]") {
+		groupablePlaceholder.groupable = false
+	}
 
-// discardSQLAlias removes any SQL alias from the input string and returns the modified string.
-// It uses a regular expression to match the alias pattern and replace it with an empty string.
-// The function is case-insensitive and matches the pattern "AS <alias_name>".
-// The input string is not modified in place.
-func discardSQLAlias(input string) string {
-	return sqlAliasRegex.ReplaceAllString(input, "")
+	return false
 }
 
 func dedupeCollectedMetadata(metadata []string) (dedupedMetadata []string, size int) {
@@ -198,7 +214,7 @@ func dedupeStatementMetadata(info *StatementMetadata) {
 	info.Size += tablesSize + commentsSize + commandsSize
 }
 
-func appendWhitespace(lastToken Token, token Token, normalizedSQLBuilder *strings.Builder) {
+func appendWhitespace(lastToken *Token, token *Token, normalizedSQLBuilder *strings.Builder) {
 	switch token.Value {
 	case ",":
 	case "=":
diff --git a/normalizer_test.go b/normalizer_test.go
index 3740769..695da32 100644
--- a/normalizer_test.go
+++ b/normalizer_test.go
@@ -747,7 +747,8 @@ func TestGroupObfuscatedValues(t *testing.T) {
 
 	for _, test := range tests {
 		t.Run("", func(t *testing.T) {
-			got := groupObfuscatedValues(test.input)
+			normalizer := NewNormalizer()
+			got, _, _ := normalizer.Normalize(test.input)
 			assert.Equal(t, test.expected, got)
 		})
 	}
diff --git a/obfuscate_and_normalize.go b/obfuscate_and_normalize.go
index 316d764..cc54860 100644
--- a/obfuscate_and_normalize.go
+++ b/obfuscate_and_normalize.go
@@ -19,20 +19,16 @@ func ObfuscateAndNormalize(input string, obfuscator *Obfuscator, normalizer *Nor
 	}
 
 	var lastToken Token // The last token that is not whitespace or comment
+	var groupablePlaceholder GroupablePlaceholder
 
 	for _, token := range lexer.ScanAll() {
-		obfuscatedToken := Token{Type: token.Type, Value: obfuscator.ObfuscateTokenValue(token, lexerOpts...)}
-		normalizer.collectMetadata(obfuscatedToken, lastToken, statementMetadata)
-		lastToken = normalizer.normalizeSQL(obfuscatedToken, lastToken, &normalizedSQLBuilder)
+		token.Value = obfuscator.ObfuscateTokenValue(token, lexerOpts...)
+		normalizer.collectMetadata(&token, &lastToken, statementMetadata)
+		normalizer.normalizeSQL(&token, &lastToken, &normalizedSQLBuilder, &groupablePlaceholder)
 	}
 
 	normalizedSQL = normalizedSQLBuilder.String()
 
-	normalizedSQL = groupObfuscatedValues(normalizedSQL)
-	if !normalizer.config.KeepSQLAlias {
-		normalizedSQL = discardSQLAlias(normalizedSQL)
-	}
-
 	// Dedupe collected metadata
 	dedupeStatementMetadata(statementMetadata)
 
diff --git a/obfuscator.go b/obfuscator.go
index ee0e658..daab528 100644
--- a/obfuscator.go
+++ b/obfuscator.go
@@ -114,7 +114,7 @@ func (o *Obfuscator) ObfuscateTokenValue(token Token, lexerOpts ...lexerOption)
 		}
 
 		if o.config.ReplaceDigits {
-			return replaceDigits(token.Value, "?")
+			return replaceDigits(token.Value, NumberPlaceholder)
 		} else {
 			return token.Value
 		}
diff --git a/sqllexer_utils.go b/sqllexer_utils.go
index e57de87..e14502c 100644
--- a/sqllexer_utils.go
+++ b/sqllexer_utils.go
@@ -1,7 +1,6 @@
 package sqllexer
 
 import (
-	"regexp"
 	"strings"
 	"unicode"
 )
@@ -19,7 +18,7 @@ const (
 	DBMSOracle DBMSType = "oracle"
 )
 
-var Commands = map[string]bool{
+var commands = map[string]bool{
 	"SELECT":   true,
 	"INSERT":   true,
 	"UPDATE":   true,
@@ -47,11 +46,96 @@ var tableIndicators = map[string]bool{
 	"TABLE":  true,
 }
 
-var keywordsRegex = regexp.MustCompile(`(?i)^(SELECT|INSERT|UPDATE|DELETE|CREATE|ALTER|DROP|GRANT|REVOKE|ADD|ALL|AND|ANY|AS|ASC|BEGIN|BETWEEN|BY|CASE|CHECK|COLUMN|COMMIT|CONSTRAINT|DATABASE|DECLARE|DEFAULT|DESC|DISTINCT|ELSE|END|EXEC|EXISTS|FOREIGN|FROM|GROUP|HAVING|IN|INDEX|INNER|INTO|IS|JOIN|KEY|LEFT|LIKE|LIMIT|NOT|ON|OR|ORDER|OUTER|PRIMARY|PROCEDURE|REPLACE|RETURNS|RIGHT|ROLLBACK|ROWNUM|SET|SOME|TABLE|TOP|TRUNCATE|UNION|UNIQUE|USE|VALUES|VIEW|WHERE|CUBE|ROLLUP|LITERAL|WINDOW|VACCUM|ANALYZE|ILIKE|USING|ASSERTION|DOMAIN|CLUSTER|COPY|EXPLAIN|PLPGSQL|TRIGGER|TEMPORARY|UNLOGGED|RECURSIVE|RETURNING)$`)
-
-var groupableRegex = regexp.MustCompile(`(\()\s*\?(?:\s*,\s*\?\s*)*\s*(\))|(\[)\s*\?(?:\s*,\s*\?\s*)*\s*(\])`)
-
-var sqlAliasRegex = regexp.MustCompile(`(?i)\s+AS\s+[\w?]+`)
+var keywords = map[string]bool{ // upper-case SQL keywords; isSQLKeyword looks up strings.ToUpper(token.Value) here
+	"SELECT":     true,
+	"INSERT":     true,
+	"UPDATE":     true,
+	"DELETE":     true,
+	"CREATE":     true,
+	"ALTER":      true,
+	"DROP":       true,
+	"GRANT":      true,
+	"REVOKE":     true,
+	"ADD":        true,
+	"ALL":        true,
+	"AND":        true,
+	"ANY":        true,
+	"AS":         true,
+	"ASC":        true,
+	"BEGIN":      true,
+	"BETWEEN":    true,
+	"BY":         true,
+	"CASE":       true,
+	"CHECK":      true,
+	"COLUMN":     true,
+	"COMMIT":     true,
+	"CONSTRAINT": true,
+	"DATABASE":   true,
+	"DECLARE":    true,
+	"DEFAULT":    true,
+	"DESC":       true,
+	"DISTINCT":   true,
+	"ELSE":       true,
+	"END":        true,
+	"EXEC":       true,
+	"EXISTS":     true,
+	"FOREIGN":    true,
+	"FROM":       true,
+	"GROUP":      true,
+	"HAVING":     true,
+	"IN":         true,
+	"INDEX":      true,
+	"INNER":      true,
+	"INTO":       true,
+	"IS":         true,
+	"JOIN":       true,
+	"KEY":        true,
+	"LEFT":       true,
+	"LIKE":       true,
+	"LIMIT":      true,
+	"NOT":        true,
+	"ON":         true,
+	"OR":         true,
+	"ORDER":      true,
+	"OUTER":      true,
+	"PRIMARY":    true,
+	"PROCEDURE":  true,
+	"REPLACE":    true,
+	"RETURNS":    true,
+	"RIGHT":      true,
+	"ROLLBACK":   true,
+	"ROWNUM":     true,
+	"SET":        true,
+	"SOME":       true,
+	"TABLE":      true,
+	"TOP":        true,
+	"TRUNCATE":   true,
+	"UNION":      true,
+	"UNIQUE":     true,
+	"USE":        true,
+	"VALUES":     true,
+	"VIEW":       true,
+	"WHERE":      true,
+	"CUBE":       true,
+	"ROLLUP":     true,
+	"LITERAL":    true,
+	"WINDOW":     true,
+	"VACUUM":     true,
+	"ANALYZE":    true,
+	"ILIKE":      true,
+	"USING":      true,
+	"ASSERTION":  true,
+	"DOMAIN":     true,
+	"CLUSTER":    true,
+	"COPY":       true,
+	"EXPLAIN":    true,
+	"PLPGSQL":    true,
+	"TRIGGER":    true,
+	"TEMPORARY":  true,
+	"UNLOGGED":   true,
+	"RECURSIVE":  true,
+	"RETURNING":  true,
+}
 
 func isWhitespace(ch rune) bool {
 	return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r'
@@ -106,7 +190,7 @@ func isEOF(ch rune) bool {
 }
 
 func isCommand(ident string) bool {
-	_, ok := Commands[ident]
+	_, ok := commands[ident]
 	return ok
 }
 
@@ -115,8 +199,12 @@ func isTableIndicator(ident string) bool {
 	return ok
 }
 
-func isSQLKeyword(token Token) bool {
-	return token.Type == IDENT && keywordsRegex.MatchString(token.Value)
+func isSQLKeyword(token *Token) bool {
+	if token.Type != IDENT {
+		return false
+	}
+	_, ok := keywords[strings.ToUpper(token.Value)]
+	return ok
 }
 
 func isBoolean(ident string) bool {

From 7e63af8d046cf6dac3850956e52b68c124ecebdb Mon Sep 17 00:00:00 2001
From: Zhengda Lu <zhengda.lu@datadoghq.com>
Date: Fri, 13 Oct 2023 16:24:52 -0400
Subject: [PATCH 2/4] add comments

---
 normalizer.go | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/normalizer.go b/normalizer.go
index 47b562e..a2e805e 100644
--- a/normalizer.go
+++ b/normalizer.go
@@ -133,15 +133,23 @@ func (n *Normalizer) normalizeSQL(token *Token, lastToken *Token, normalizedSQLB
 		if !n.config.KeepSQLAlias {
 			// discard SQL alias
 			if strings.ToUpper(token.Value) == "AS" {
+				// the current token is AS: do not write it yet and move on to the
+				// next token, because without seeing the next token we cannot
+				// determine whether this AS introduces an alias or not
 				*lastToken = *token
 				return
 			}
 
 			if strings.ToUpper(lastToken.Value) == "AS" {
 				if token.Type == IDENT {
+					// if the last token is AS and the current token is IDENT,
+					// then the current token is an alias, so we discard it
 					*lastToken = *token
 					return
 				} else {
+					// if the last token is AS and the current token is not IDENT,
+					// this could be a CTE like WITH ... AS (...),
+					// so we write the deferred AS and do not discard the current token
 					appendWhitespace(lastToken, token, normalizedSQLBuilder)
 					n.writeToken(lastToken, normalizedSQLBuilder)
 				}
@@ -174,6 +182,9 @@ func (n *Normalizer) writeToken(token *Token, normalizedSQLBuilder *strings.Buil
 func (n *Normalizer) isObfuscatedValueGroupable(token *Token, lastToken *Token, groupablePlaceholder *GroupablePlaceholder) bool {
 	if token.Value == NumberPlaceholder || token.Value == StringPlaceholder {
 		if lastToken.Value == "(" || lastToken.Value == "[" {
+			// if the last token is "(" or "[", and the current token is a placeholder,
+			// we know it's the start of groupable placeholders
+			// we don't return here because we still need to write the first placeholder
 			groupablePlaceholder.groupable = true
 		} else if lastToken.Value == "," && groupablePlaceholder.groupable {
 			return true
@@ -185,6 +196,7 @@ func (n *Normalizer) isObfuscatedValueGroupable(token *Token, lastToken *Token,
 	}
 
 	if groupablePlaceholder.groupable && (token.Value == ")" || token.Value == "]") {
+		// end of groupable placeholders
 		groupablePlaceholder.groupable = false
 	}
 

From 1b11a0e362f9618005b96d02213c9723f97a5625 Mon Sep 17 00:00:00 2001
From: Zhengda Lu <zhengda.lu@datadoghq.com>
Date: Mon, 16 Oct 2023 11:04:56 -0400
Subject: [PATCH 3/4] unit test to cover postgres ANY keyword

---
 normalizer_test.go | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/normalizer_test.go b/normalizer_test.go
index 695da32..b8c619d 100644
--- a/normalizer_test.go
+++ b/normalizer_test.go
@@ -743,6 +743,14 @@ func TestGroupObfuscatedValues(t *testing.T) {
 			input:    "[ ?,?]",
 			expected: "[ ? ]",
 		},
+		{
+			input:    "ANY(?)",
+			expected: "ANY ( ? )",
+		},
+		{
+			input:    "ANY(?, ?)",
+			expected: "ANY ( ? )",
+		},
 	}
 
 	for _, test := range tests {

From 5bf54fbc88d08552ae73c55096442d1cb4e05145 Mon Sep 17 00:00:00 2001
From: Zhengda Lu <zhengda.lu@datadoghq.com>
Date: Tue, 17 Oct 2023 13:55:07 -0400
Subject: [PATCH 4/4] make groupablePlaceholder unexported

---
 normalizer.go              | 8 ++++----
 obfuscate_and_normalize.go | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/normalizer.go b/normalizer.go
index a2e805e..bfa87ae 100644
--- a/normalizer.go
+++ b/normalizer.go
@@ -60,7 +60,7 @@ type StatementMetadata struct {
 	Commands []string
 }
 
-type GroupablePlaceholder struct {
+type groupablePlaceholder struct {
 	groupable bool
 }
 
@@ -98,7 +98,7 @@ func (n *Normalizer) Normalize(input string, lexerOpts ...lexerOption) (normaliz
 	}
 
 	var lastToken Token // The last token that is not whitespace or comment
-	var groupablePlaceholder GroupablePlaceholder
+	var groupablePlaceholder groupablePlaceholder
 
 	for _, token := range lexer.ScanAll() {
 		n.collectMetadata(&token, &lastToken, statementMetadata)
@@ -128,7 +128,7 @@ func (n *Normalizer) collectMetadata(token *Token, lastToken *Token, statementMe
 	}
 }
 
-func (n *Normalizer) normalizeSQL(token *Token, lastToken *Token, normalizedSQLBuilder *strings.Builder, groupablePlaceholder *GroupablePlaceholder) {
+func (n *Normalizer) normalizeSQL(token *Token, lastToken *Token, normalizedSQLBuilder *strings.Builder, groupablePlaceholder *groupablePlaceholder) {
 	if token.Type != WS && token.Type != COMMENT && token.Type != MULTILINE_COMMENT {
 		if !n.config.KeepSQLAlias {
 			// discard SQL alias
@@ -179,7 +179,7 @@ func (n *Normalizer) writeToken(token *Token, normalizedSQLBuilder *strings.Buil
 	}
 }
 
-func (n *Normalizer) isObfuscatedValueGroupable(token *Token, lastToken *Token, groupablePlaceholder *GroupablePlaceholder) bool {
+func (n *Normalizer) isObfuscatedValueGroupable(token *Token, lastToken *Token, groupablePlaceholder *groupablePlaceholder) bool {
 	if token.Value == NumberPlaceholder || token.Value == StringPlaceholder {
 		if lastToken.Value == "(" || lastToken.Value == "[" {
 			// if the last token is "(" or "[", and the current token is a placeholder,
diff --git a/obfuscate_and_normalize.go b/obfuscate_and_normalize.go
index cc54860..9d4798e 100644
--- a/obfuscate_and_normalize.go
+++ b/obfuscate_and_normalize.go
@@ -19,7 +19,7 @@ func ObfuscateAndNormalize(input string, obfuscator *Obfuscator, normalizer *Nor
 	}
 
 	var lastToken Token // The last token that is not whitespace or comment
-	var groupablePlaceholder GroupablePlaceholder
+	var groupablePlaceholder groupablePlaceholder
 
 	for _, token := range lexer.ScanAll() {
 		token.Value = obfuscator.ObfuscateTokenValue(token, lexerOpts...)