Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

func to run obfuscation and normalization in one pass #12

Merged
merged 4 commits into from
Sep 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 43 additions & 51 deletions normalizer.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,14 +83,15 @@ const (
// Normalize takes an input SQL string and returns a normalized SQL string, a StatementMetadata struct, and an error.
// The normalizer collapses input SQL into compact format, groups obfuscated values into single placeholder,
// and collects metadata such as table names, comments, and commands.
func (n *Normalizer) Normalize(input string, lexerOpts ...lexerOption) (normalized string, info *StatementMetadata, err error) {
func (n *Normalizer) Normalize(input string, lexerOpts ...lexerOption) (normalizedSQL string, statementMetadata *StatementMetadata, err error) {
lexer := New(
input,
lexerOpts...,
)

var normalizedSQL string
var statementMetadata = &StatementMetadata{
var normalizedSQLBuilder strings.Builder

statementMetadata = &StatementMetadata{
Tables: []string{},
Comments: []string{},
Commands: []string{},
Expand All @@ -99,41 +100,15 @@ func (n *Normalizer) Normalize(input string, lexerOpts ...lexerOption) (normaliz
var lastToken Token // The last token that is not whitespace or comment

for _, token := range lexer.ScanAll() {
if token.Type == COMMENT || token.Type == MULTILINE_COMMENT {
// Collect comments
if n.config.CollectComments {
statementMetadata.Comments = append(statementMetadata.Comments, token.Value)
}
} else if token.Type == IDENT {
if isCommand(strings.ToUpper(token.Value)) && n.config.CollectCommands {
// Collect commands
statementMetadata.Commands = append(statementMetadata.Commands, strings.ToUpper(token.Value))
} else if isTableIndicator(strings.ToUpper(lastToken.Value)) {
// Collect table names
if n.config.CollectTables {
statementMetadata.Tables = append(statementMetadata.Tables, token.Value)
}
}
}

normalizedSQL = normalizeSQL(token, lastToken, normalizedSQL, n.config)

// TODO: We rely on the WS token to determine if we should add a whitespace
// This is not ideal, as SQLs with slightly different formatting will NOT be normalized into single family
// e.g. "SELECT * FROM table where id = ?" and "SELECT * FROM table where id= ?" will be normalized into different family
if token.Type != WS && token.Type != COMMENT && token.Type != MULTILINE_COMMENT {
lastToken = token
}
n.collectMetadata(token, lastToken, statementMetadata)
lastToken = n.normalizeSQL(token, lastToken, &normalizedSQLBuilder)
}

// We use regex to group consecutive obfuscated values into single placeholder.
// This is "less" performant than token by token processing,
// but it is much simpler to implement and maintain.
// The trade off made here is assuming normalization runs on backend
// where performance is not as critical as the agent.
normalizedSQL = normalizedSQLBuilder.String()

normalizedSQL = groupObfuscatedValues(normalizedSQL)
if !n.config.KeepSQLAlias {
normalizedSQL = DiscardSQLAlias(normalizedSQL)
normalizedSQL = discardSQLAlias(normalizedSQL)
}

// Dedupe collected metadata
Expand All @@ -142,27 +117,46 @@ func (n *Normalizer) Normalize(input string, lexerOpts ...lexerOption) (normaliz
return strings.TrimSpace(normalizedSQL), statementMetadata, nil
}

func normalizeSQL(token Token, lastToken Token, statement string, config *normalizerConfig) string {
if token.Type == WS || token.Type == COMMENT || token.Type == MULTILINE_COMMENT {
// We don't rely on the WS token to determine if we should add a whitespace
return statement
// collectMetadata records metadata from a single token into statementMetadata,
// gated by the normalizer's configuration:
//   - comment tokens (single- and multi-line) are collected verbatim when
//     CollectComments is set;
//   - IDENT tokens recognized by isCommand are collected uppercased when
//     CollectCommands is set;
//   - otherwise an IDENT is collected as a table name when CollectTables is
//     set and the previous significant token satisfies isTableIndicator.
func (n *Normalizer) collectMetadata(token Token, lastToken Token, statementMetadata *StatementMetadata) {
	if n.config.CollectComments && (token.Type == COMMENT || token.Type == MULTILINE_COMMENT) {
		// Collect comments
		statementMetadata.Comments = append(statementMetadata.Comments, token.Value)
	} else if token.Type == IDENT {
		if n.config.CollectCommands && isCommand(strings.ToUpper(token.Value)) {
			// Collect commands
			statementMetadata.Commands = append(statementMetadata.Commands, strings.ToUpper(token.Value))
		} else if n.config.CollectTables && isTableIndicator(strings.ToUpper(lastToken.Value)) {
			// Collect table names
			statementMetadata.Tables = append(statementMetadata.Tables, token.Value)
		}
	}
}

// determine if we should add a whitespace
statement = appendWhitespace(lastToken, token, statement)
if isSQLKeyword(token) && config.UppercaseKeywords {
statement += strings.ToUpper(token.Value)
} else {
statement += token.Value
// normalizeSQL appends token's value to normalizedSQLBuilder, skipping
// whitespace and comment tokens entirely. A separating space is written first
// when appendWhitespace decides one is needed, and the value is uppercased
// when it is a SQL keyword and the UppercaseKeywords option is set.
// It returns the new "last significant token": token itself if it was
// written, otherwise lastToken unchanged.
func (n *Normalizer) normalizeSQL(token Token, lastToken Token, normalizedSQLBuilder *strings.Builder) Token {
	if token.Type != WS && token.Type != COMMENT && token.Type != MULTILINE_COMMENT {
		// determine if we should add a whitespace
		appendWhitespace(lastToken, token, normalizedSQLBuilder)
		if n.config.UppercaseKeywords && isSQLKeyword(token) {
			normalizedSQLBuilder.WriteString(strings.ToUpper(token.Value))
		} else {
			normalizedSQLBuilder.WriteString(token.Value)
		}

		lastToken = token
	}

	return lastToken
}

// groupObfuscatedValues groups consecutive obfuscated values in a SQL query into a single placeholder.
// It replaces "(?, ?, ...)" and "[?, ?, ...]" with "( ? )" and "[ ? ]", respectively.
// Returns the modified SQL query as a string.
func groupObfuscatedValues(input string) string {
// We use regex to group consecutive obfuscated values into single placeholder.
// This is "less" performant than token by token processing,
// but it is much simpler to implement and maintain.
// The trade off made here is assuming normalization runs on backend
// where performance is not as critical as the agent.
grouped := groupableRegex.ReplaceAllStringFunc(input, func(match string) string {
if match[0] == '(' {
return ArrayPlaceholder
Expand All @@ -172,11 +166,11 @@ func groupObfuscatedValues(input string) string {
return grouped
}

// DiscardSQLAlias removes any SQL alias from the input string and returns the modified string.
// discardSQLAlias removes any SQL alias from the input string and returns the modified string.
// It uses a regular expression to match the alias pattern and replace it with an empty string.
// The function is case-insensitive and matches the pattern "AS <alias_name>".
// The input string is not modified in place.
func DiscardSQLAlias(input string) string {
func discardSQLAlias(input string) string {
return sqlAliasRegex.ReplaceAllString(input, "")
}

Expand All @@ -201,7 +195,7 @@ func dedupeStatementMetadata(info *StatementMetadata) {
info.Commands = dedupeCollectedMetadata(info.Commands)
}

func appendWhitespace(lastToken Token, token Token, normalizedSQL string) string {
func appendWhitespace(lastToken Token, token Token, normalizedSQLBuilder *strings.Builder) {
switch token.Value {
case ",":
case "=":
Expand All @@ -212,8 +206,6 @@ func appendWhitespace(lastToken Token, token Token, normalizedSQL string) string
}
fallthrough
default:
normalizedSQL += " "
normalizedSQLBuilder.WriteString(" ")
}

return normalizedSQL
}
40 changes: 40 additions & 0 deletions obfuscate_and_normalize.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
package sqllexer

import "strings"

// ObfuscateAndNormalize takes an input SQL string and returns a normalized SQL
// string together with its collected StatementMetadata.
// It is a convenience helper that runs the Obfuscator and the Normalizer in a
// single token scan, so the input is lexed only once.
func ObfuscateAndNormalize(input string, obfuscator *Obfuscator, normalizer *Normalizer, lexerOpts ...lexerOption) (normalizedSQL string, statementMetadata *StatementMetadata, err error) {
	statementMetadata = &StatementMetadata{
		Tables:   []string{},
		Comments: []string{},
		Commands: []string{},
	}

	var (
		sb   strings.Builder
		prev Token // last token that was neither whitespace nor a comment
	)

	// Obfuscate each token first, then feed the obfuscated token through the
	// normalizer's metadata collection and SQL reassembly in the same pass.
	for _, tok := range New(input, lexerOpts...).ScanAll() {
		obfuscated := Token{Type: tok.Type, Value: obfuscator.ObfuscateTokenValue(tok, lexerOpts...)}
		normalizer.collectMetadata(obfuscated, prev, statementMetadata)
		prev = normalizer.normalizeSQL(obfuscated, prev, &sb)
	}

	normalizedSQL = sb.String()

	// Collapse runs of consecutive obfuscated values into one placeholder,
	// then optionally strip SQL aliases.
	normalizedSQL = groupObfuscatedValues(normalizedSQL)
	if !normalizer.config.KeepSQLAlias {
		normalizedSQL = discardSQLAlias(normalizedSQL)
	}

	// Dedupe collected metadata
	dedupeStatementMetadata(statementMetadata)

	return strings.TrimSpace(normalizedSQL), statementMetadata, nil
}
112 changes: 112 additions & 0 deletions obfuscate_and_normalize_bench_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
package sqllexer

import (
"fmt"
"strconv"
"testing"
)

// BenchmarkObfuscationAndNormalization measures the combined one-pass
// ObfuscateAndNormalize path on queries of varying size and shape.
func BenchmarkObfuscationAndNormalization(b *testing.B) {
	// LargeQuery is sourced from https://stackoverflow.com/questions/12607667/issues-with-a-very-large-sql-query/12711494
	var LargeQuery = `SELECT '%c%' as Chapter,
(SELECT count(ticket.id) AS Matches FROM engine.ticket INNER JOIN engine.ticket_custom ON ticket.id = ticket_custom.ticket
WHERE ticket_custom.name='chapter' AND ticket_custom.value LIKE '%c%' AND type='New material' AND milestone='1.1.12' AND component NOT LIKE 'internal_engine' AND ticket.status IN ('new','assigned') ) AS 'New',
(SELECT count(ticket.id) AS Matches FROM engine.ticket INNER JOIN engine.ticket_custom ON ticket.id = ticket_custom.ticket
WHERE ticket_custom.name='chapter' AND ticket_custom.value LIKE '%c%' AND type='New material' AND milestone='1.1.12' AND component NOT LIKE 'internal_engine' AND ticket.status='document_interface' ) AS 'Document\
Interface',
(SELECT count(ticket.id) AS Matches FROM engine.ticket INNER JOIN engine.ticket_custom ON ticket.id = ticket_custom.ticket
WHERE ticket_custom.name='chapter' AND ticket_custom.value LIKE '%c%' AND type='New material' AND milestone='1.1.12' AND component NOT LIKE 'internal_engine' AND ticket.status='interface_development' ) AS 'Inter\
face Development',
(SELECT count(ticket.id) AS Matches FROM engine.ticket INNER JOIN engine.ticket_custom ON ticket.id = ticket_custom.ticket
WHERE ticket_custom.name='chapter' AND ticket_custom.value LIKE '%c%' AND type='New material' AND milestone='1.1.12' AND component NOT LIKE 'internal_engine' AND ticket.status='interface_check' ) AS 'Interface C\
heck',
(SELECT count(ticket.id) AS Matches FROM engine.ticket INNER JOIN engine.ticket_custom ON ticket.id = ticket_custom.ticket
WHERE ticket_custom.name='chapter' AND ticket_custom.value LIKE '%c%' AND type='New material' AND milestone='1.1.12' AND component NOT LIKE 'internal_engine' AND ticket.status='document_routine' ) AS 'Document R\
outine',
(SELECT count(ticket.id) AS Matches FROM engine.ticket INNER JOIN engine.ticket_custom ON ticket.id = ticket_custom.ticket
WHERE ticket_custom.name='chapter' AND ticket_custom.value LIKE '%c%' AND type='New material' AND milestone='1.1.12' AND component NOT LIKE 'internal_engine' AND ticket.status='full_development' ) AS 'Full Devel\
opment',
(SELECT count(ticket.id) AS Matches FROM engine.ticket INNER JOIN engine.ticket_custom ON ticket.id = ticket_custom.ticket
WHERE ticket_custom.name='chapter' AND ticket_custom.value LIKE '%c%' AND type='New material' AND milestone='1.1.12' AND component NOT LIKE 'internal_engine' AND ticket.status='peer_review_1' ) AS 'Peer Review O\
ne',
(SELECT count(ticket.id) AS Matches FROM engine.ticket INNER JOIN engine.ticket_custom ON ticket.id = ticket_custom.ticket
WHERE ticket_custom.name='chapter' AND ticket_custom.value LIKE '%c%'AND type='New material' AND milestone='1.1.12' AND component NOT LIKE 'internal_engine' AND ticket.status='peer_review_2' ) AS 'Peer Review Tw\
o',
(SELECT count(ticket.id) AS Matches FROM engine.ticket INNER JOIN engine.ticket_custom ON ticket.id = ticket_custom.ticket
WHERE ticket_custom.name='chapter' AND ticket_custom.value LIKE '%c%' AND type='New material' AND milestone='1.1.12' AND component NOT LIKE 'internal_engine' AND ticket.status='qa' ) AS 'QA',
(SELECT count(ticket.id) AS Matches FROM engine.ticket INNER JOIN engine.ticket_custom ON ticket.id = ticket_custom.ticket
WHERE ticket_custom.name='chapter' AND ticket_custom.value LIKE '%c%'AND type='New material' AND milestone='1.1.12' AND component NOT LIKE 'internal_engine' AND ticket.status='closed' ) AS 'Closed',
count(id) AS Total,
ticket.id AS _id
FROM engine.ticket
INNER JOIN engine.ticket_custom ON ticket.id = ticket_custom.ticket
WHERE ticket_custom.name='chapter' AND ticket_custom.value LIKE '%c%' AND type='New material' AND milestone='1.1.12' AND component NOT LIKE 'internal_engine'`

	// ComplexQuery is sourced from https://www.ibm.com/support/knowledgecenter/SSCRJT_6.0.0/com.ibm.swg.im.bigsql.doc/doc/tut_bsql_uc_complex_query.html
	var ComplexQuery = `WITH
sales AS
(SELECT sf.*
FROM gosalesdw.sls_order_method_dim AS md,
gosalesdw.sls_product_dim AS pd,
gosalesdw.emp_employee_dim AS ed,
gosalesdw.sls_sales_fact AS sf
WHERE pd.product_key = sf.product_key
AND pd.product_number > 10000
AND pd.base_product_key > 30
AND md.order_method_key = sf.order_method_key
AND md.order_method_code > 5
AND ed.employee_key = sf.employee_key
AND ed.manager_code1 > 20),
inventory AS
(SELECT if.*
FROM gosalesdw.go_branch_dim AS bd,
gosalesdw.dist_inventory_fact AS if
WHERE if.branch_key = bd.branch_key
AND bd.branch_code > 20)
SELECT sales.product_key AS PROD_KEY,
SUM(CAST (inventory.quantity_shipped AS BIGINT)) AS INV_SHIPPED,
SUM(CAST (sales.quantity AS BIGINT)) AS PROD_QUANTITY,
RANK() OVER ( ORDER BY SUM(CAST (sales.quantity AS BIGINT)) DESC) AS PROD_RANK
FROM sales, inventory
WHERE sales.product_key = inventory.product_key
GROUP BY sales.product_key;
`

	// superLargeQuery is an already-obfuscated T-SQL statement (values replaced
	// with "?"); its trailing "%d" verb is filled in via fmt.Sprintf below.
	var superLargeQuery = "select top ? percent IdTrebEmpresa, CodCli, NOMEMP, Baixa, CASE WHEN IdCentreTreball IS ? THEN ? ELSE CONVERT ( VARCHAR ( ? ) IdCentreTreball ) END, CASE WHEN NOMESTAB IS ? THEN ? ELSE NOMESTAB END, TIPUS, CASE WHEN IdLloc IS ? THEN ? ELSE CONVERT ( VARCHAR ( ? ) IdLloc ) END, CASE WHEN NomLlocComplert IS ? THEN ? ELSE NomLlocComplert END, CASE WHEN DesLloc IS ? THEN ? ELSE DesLloc END, IdLlocTreballUnic From ( SELECT ?, dbo.Treb_Empresa.IdTrebEmpresa, dbo.Treb_Empresa.IdTreballador, dbo.Treb_Empresa.CodCli, dbo.Clients.NOMEMP, dbo.Treb_Empresa.Baixa, dbo.Treb_Empresa.IdCentreTreball, dbo.Cli_Establiments.NOMESTAB, ?, ?, dbo.Treb_Empresa.DataInici, dbo.Treb_Empresa.DataFi, CASE WHEN dbo.Treb_Empresa.DesLloc IS ? THEN ? ELSE dbo.Treb_Empresa.DesLloc END DesLloc, dbo.Treb_Empresa.IdLlocTreballUnic FROM dbo.Clients WITH ( NOLOCK ) INNER JOIN dbo.Treb_Empresa WITH ( NOLOCK ) ON dbo.Clients.CODCLI = dbo.Treb_Empresa.CodCli LEFT OUTER JOIN dbo.Cli_Establiments WITH ( NOLOCK ) ON dbo.Cli_Establiments.Id_ESTAB_CLI = dbo.Treb_Empresa.IdCentreTreball AND dbo.Cli_Establiments.CODCLI = dbo.Treb_Empresa.CodCli WHERE dbo.Treb_Empresa.IdTreballador = ? AND Treb_Empresa.IdTecEIRLLlocTreball IS ? AND IdMedEIRLLlocTreball IS ? AND IdLlocTreballTemporal IS ? UNION ALL SELECT ?, dbo.Treb_Empresa.IdTrebEmpresa, dbo.Treb_Empresa.IdTreballador, dbo.Treb_Empresa.CodCli, dbo.Clients.NOMEMP, dbo.Treb_Empresa.Baixa, dbo.Treb_Empresa.IdCentreTreball, dbo.Cli_Establiments.NOMESTAB, dbo.Treb_Empresa.IdTecEIRLLlocTreball, dbo.fn_NomLlocComposat ( dbo.Treb_Empresa.IdTecEIRLLlocTreball ), dbo.Treb_Empresa.DataInici, dbo.Treb_Empresa.DataFi, CASE WHEN dbo.Treb_Empresa.DesLloc IS ? THEN ? 
ELSE dbo.Treb_Empresa.DesLloc END DesLloc, dbo.Treb_Empresa.IdLlocTreballUnic FROM dbo.Clients WITH ( NOLOCK ) INNER JOIN dbo.Treb_Empresa WITH ( NOLOCK ) ON dbo.Clients.CODCLI = dbo.Treb_Empresa.CodCli LEFT OUTER JOIN dbo.Cli_Establiments WITH ( NOLOCK ) ON dbo.Cli_Establiments.Id_ESTAB_CLI = dbo.Treb_Empresa.IdCentreTreball AND dbo.Cli_Establiments.CODCLI = dbo.Treb_Empresa.CodCli WHERE ( dbo.Treb_Empresa.IdTreballador = ? ) AND ( NOT ( dbo.Treb_Empresa.IdTecEIRLLlocTreball IS ? ) ) UNION ALL SELECT ?, dbo.Treb_Empresa.IdTrebEmpresa, dbo.Treb_Empresa.IdTreballador, dbo.Treb_Empresa.CodCli, dbo.Clients.NOMEMP, dbo.Treb_Empresa.Baixa, dbo.Treb_Empresa.IdCentreTreball, dbo.Cli_Establiments.NOMESTAB, dbo.Treb_Empresa.IdMedEIRLLlocTreball, dbo.fn_NomMedEIRLLlocComposat ( dbo.Treb_Empresa.IdMedEIRLLlocTreball ), dbo.Treb_Empresa.DataInici, dbo.Treb_Empresa.DataFi, CASE WHEN dbo.Treb_Empresa.DesLloc IS ? THEN ? ELSE dbo.Treb_Empresa.DesLloc END DesLloc, dbo.Treb_Empresa.IdLlocTreballUnic FROM dbo.Clients WITH ( NOLOCK ) INNER JOIN dbo.Treb_Empresa WITH ( NOLOCK ) ON dbo.Clients.CODCLI = dbo.Treb_Empresa.CodCli LEFT OUTER JOIN dbo.Cli_Establiments WITH ( NOLOCK ) ON dbo.Cli_Establiments.Id_ESTAB_CLI = dbo.Treb_Empresa.IdCentreTreball AND dbo.Cli_Establiments.CODCLI = dbo.Treb_Empresa.CodCli WHERE ( dbo.Treb_Empresa.IdTreballador = ? ) AND ( Treb_Empresa.IdTecEIRLLlocTreball IS ? ) AND ( NOT ( dbo.Treb_Empresa.IdMedEIRLLlocTreball IS ? ) ) UNION ALL SELECT ?, dbo.Treb_Empresa.IdTrebEmpresa, dbo.Treb_Empresa.IdTreballador, dbo.Treb_Empresa.CodCli, dbo.Clients.NOMEMP, dbo.Treb_Empresa.Baixa, dbo.Treb_Empresa.IdCentreTreball, dbo.Cli_Establiments.NOMESTAB, dbo.Treb_Empresa.IdLlocTreballTemporal, dbo.Lloc_Treball_Temporal.NomLlocTreball, dbo.Treb_Empresa.DataInici, dbo.Treb_Empresa.DataFi, CASE WHEN dbo.Treb_Empresa.DesLloc IS ? THEN ? 
ELSE dbo.Treb_Empresa.DesLloc END DesLloc, dbo.Treb_Empresa.IdLlocTreballUnic FROM dbo.Clients WITH ( NOLOCK ) INNER JOIN dbo.Treb_Empresa WITH ( NOLOCK ) ON dbo.Clients.CODCLI = dbo.Treb_Empresa.CodCli INNER JOIN dbo.Lloc_Treball_Temporal WITH ( NOLOCK ) ON dbo.Treb_Empresa.IdLlocTreballTemporal = dbo.Lloc_Treball_Temporal.IdLlocTreballTemporal LEFT OUTER JOIN dbo.Cli_Establiments WITH ( NOLOCK ) ON dbo.Cli_Establiments.Id_ESTAB_CLI = dbo.Treb_Empresa.IdCentreTreball AND dbo.Cli_Establiments.CODCLI = dbo.Treb_Empresa.CodCli WHERE dbo.Treb_Empresa.IdTreballador = ? AND Treb_Empresa.IdTecEIRLLlocTreball IS ? AND IdMedEIRLLlocTreball IS ? ) Where ? = %d"

	// Each case pairs a short descriptive name with a query exercising a
	// different code path: escaped quotes, groupable value lists, and
	// progressively larger statements.
	benchmarks := []struct {
		name  string
		query string
	}{
		{"Escaping", `INSERT INTO delayed_jobs (attempts, created_at, failed_at, handler, last_error, locked_at, locked_by, priority, queue, run_at, updated_at) VALUES (0, '2016-12-04 17:09:59', NULL, '--- !ruby/object:Delayed::PerformableMethod\nobject: !ruby/object:Item\n store:\n - a simple string\n - an \'escaped \' string\n - another \'escaped\' string\n - 42\n string: a string with many \\\\\'escapes\\\\\'\nmethod_name: :show_store\nargs: []\n', NULL, NULL, NULL, 0, NULL, '2016-12-04 17:09:59', '2016-12-04 17:09:59')`},
		{"Grouping", `INSERT INTO delayed_jobs (created_at, failed_at, handler) VALUES (0, '2016-12-04 17:09:59', NULL), (0, '2016-12-04 17:09:59', NULL), (0, '2016-12-04 17:09:59', NULL), (0, '2016-12-04 17:09:59', NULL)`},
		{"Large", LargeQuery},
		{"Complex", ComplexQuery},
		{"SuperLarge", fmt.Sprintf(superLargeQuery, 1)},
	}
	// Obfuscator and normalizer are configured once and shared by all subtests.
	obfuscator := NewObfuscator(
		WithReplaceDigits(true),
		WithDollarQuotedFunc(true),
	)

	normalizer := NewNormalizer(
		WithCollectComments(true),
		WithCollectCommands(true),
		WithCollectTables(true),
		WithKeepSQLAlias(false),
	)

	for _, bm := range benchmarks {
		// Subtest names embed the query length so results can be compared by input size.
		b.Run(bm.name+"/"+strconv.Itoa(len(bm.query)), func(b *testing.B) {
			b.ResetTimer()
			b.ReportAllocs()
			for i := 0; i < b.N; i++ {
				_, _, err := ObfuscateAndNormalize(bm.query, obfuscator, normalizer)
				if err != nil {
					b.Fatal(err)
				}
			}
		})
	}
}
Loading