From ba67867ca58590fb2ca5596df7908ee0946ad4ec Mon Sep 17 00:00:00 2001 From: Zhengda Lu Date: Wed, 21 Aug 2024 16:39:24 -0400 Subject: [PATCH] do not collect CTEs as tables --- normalizer.go | 15 +++++++++++---- normalizer_test.go | 12 ++++++------ obfuscate_and_normalize.go | 4 +++- obfuscate_and_normalize_test.go | 4 ++-- .../extremely-complex-poorly-written-sql.json | 4 ++-- testdata/mssql/delete/delete-with-cte.json | 4 ++-- testdata/mssql/select/select-with-cte.json | 4 ++-- testdata/mssql/update/update-with-cte.json | 4 ++-- .../complex/super-complex-oracle-query.json | 4 ++-- .../oracle/select/select-using-with-clause.json | 4 ++-- testdata/postgresql/delete/delete-with-cte.json | 5 ++--- .../select/common-table-expressions-cte.json | 5 ++--- testdata/postgresql/update/update-with-cte.json | 5 ++--- 13 files changed, 40 insertions(+), 34 deletions(-) diff --git a/normalizer.go b/normalizer.go index fe437e0..aa55702 100644 --- a/normalizer.go +++ b/normalizer.go @@ -141,12 +141,14 @@ func (n *Normalizer) Normalize(input string, lexerOpts ...lexerOption) (normaliz var lastToken Token // The last token that is not whitespace or comment var groupablePlaceholder groupablePlaceholder + ctes := make(map[string]bool) // Holds the CTEs that are currently being processed + for { token := lexer.Scan() if token.Type == EOF { break } - n.collectMetadata(&token, &lastToken, statementMetadata) + n.collectMetadata(&token, &lastToken, statementMetadata, ctes) n.normalizeSQL(&token, &lastToken, &normalizedSQLBuilder, &groupablePlaceholder, lexerOpts...) 
} @@ -158,7 +160,7 @@ func (n *Normalizer) Normalize(input string, lexerOpts ...lexerOption) (normaliz return n.trimNormalizedSQL(normalizedSQL), statementMetadata, nil } -func (n *Normalizer) collectMetadata(token *Token, lastToken *Token, statementMetadata *StatementMetadata) { +func (n *Normalizer) collectMetadata(token *Token, lastToken *Token, statementMetadata *StatementMetadata, ctes map[string]bool) { if n.config.CollectComments && (token.Type == COMMENT || token.Type == MULTILINE_COMMENT) { // Collect comments statementMetadata.Comments = append(statementMetadata.Comments, token.Value) @@ -175,9 +177,14 @@ func (n *Normalizer) collectMetadata(token *Token, lastToken *Token, statementMe if n.config.CollectCommands && isCommand(strings.ToUpper(tokenVal)) { // Collect commands statementMetadata.Commands = append(statementMetadata.Commands, strings.ToUpper(tokenVal)) + } else if strings.ToUpper(lastToken.Value) == "WITH" && token.Type == IDENT { + // Collect CTEs so we can skip them later in table collection + ctes[tokenVal] = true } else if n.config.CollectTables && isTableIndicator(strings.ToUpper(lastToken.Value)) && !isSQLKeyword(token) { - // Collect table names - statementMetadata.Tables = append(statementMetadata.Tables, tokenVal) + // Collect table names only if the token is not a previously seen CTE + if _, ok := ctes[tokenVal]; !ok { + statementMetadata.Tables = append(statementMetadata.Tables, tokenVal) + } } else if n.config.CollectProcedure && isProcedure(lastToken) { // Collect procedure names statementMetadata.Procedures = append(statementMetadata.Procedures, tokenVal) diff --git a/normalizer_test.go b/normalizer_test.go index e3e36d1..730ae63 100644 --- a/normalizer_test.go +++ b/normalizer_test.go @@ -200,22 +200,22 @@ multiline comment */ `, expected: "WITH cte AS ( SELECT id, name, age FROM person WHERE age > ? ) UPDATE person SET age = ? WHERE id IN ( SELECT id FROM cte ); INSERT INTO person ( name, age ) SELECT name, ? 
FROM cte WHERE age <= ?", statementMetadata: StatementMetadata{ - Tables: []string{"person", "cte"}, + Tables: []string{"person"}, Comments: []string{}, Commands: []string{"SELECT", "UPDATE", "INSERT"}, Procedures: []string{}, - Size: 27, + Size: 24, }, }, { input: "WITH updates AS ( UPDATE metrics_metadata SET metric_type = ? updated = ? :: timestamp, interval = ? unit_id = ? per_unit_id = ? description = ? orientation = ? integration = ? short_name = ? WHERE metric_key = ? AND org_id = ? RETURNING ? ) INSERT INTO metrics_metadata ( org_id, metric_key, metric_type, interval, unit_id, per_unit_id, description, orientation, integration, short_name ) SELECT ? WHERE NOT EXISTS ( SELECT ? FROM updates )", expected: "WITH updates AS ( UPDATE metrics_metadata SET metric_type = ? updated = ? :: timestamp, interval = ? unit_id = ? per_unit_id = ? description = ? orientation = ? integration = ? short_name = ? WHERE metric_key = ? AND org_id = ? RETURNING ? ) INSERT INTO metrics_metadata ( org_id, metric_key, metric_type, interval, unit_id, per_unit_id, description, orientation, integration, short_name ) SELECT ? WHERE NOT EXISTS ( SELECT ? FROM updates )", statementMetadata: StatementMetadata{ - Tables: []string{"metrics_metadata", "updates"}, + Tables: []string{"metrics_metadata"}, Comments: []string{}, Commands: []string{"UPDATE", "INSERT", "SELECT"}, Procedures: []string{}, - Size: 41, + Size: 34, }, }, { @@ -283,11 +283,11 @@ multiline comment */ input: "/* Testing explicit table SQL expression */ WITH T1 AS (SELECT PNO , PNAME , COLOR , WEIGHT , CITY FROM P WHERE CITY = ?), T2 AS (SELECT PNO, PNAME, COLOR, WEIGHT, CITY, ? * WEIGHT AS NEW_WEIGHT, ? AS NEW_CITY FROM T1), T3 AS ( SELECT PNO , PNAME, COLOR, NEW_WEIGHT AS WEIGHT, NEW_CITY AS CITY FROM T2), T4 AS ( TABLE P EXCEPT CORRESPONDING TABLE T1) TABLE T4 UNION CORRESPONDING TABLE T3", expected: "WITH T1 AS ( SELECT PNO, PNAME, COLOR, WEIGHT, CITY FROM P WHERE CITY = ? 
), T2 AS ( SELECT PNO, PNAME, COLOR, WEIGHT, CITY, ? * WEIGHT, ? FROM T1 ), T3 AS ( SELECT PNO, PNAME, COLOR, NEW_WEIGHT, NEW_CITY FROM T2 ), T4 AS ( TABLE P EXCEPT CORRESPONDING TABLE T1 ) TABLE T4 UNION CORRESPONDING TABLE T3", statementMetadata: StatementMetadata{ - Tables: []string{"P", "T1", "T2", "T4", "T3"}, + Tables: []string{"P", "T2", "T4", "T3"}, Comments: []string{"/* Testing explicit table SQL expression */"}, Commands: []string{"SELECT"}, Procedures: []string{}, - Size: 58, + Size: 56, }, }, { diff --git a/obfuscate_and_normalize.go b/obfuscate_and_normalize.go index b52c46c..bf60c02 100644 --- a/obfuscate_and_normalize.go +++ b/obfuscate_and_normalize.go @@ -22,13 +22,15 @@ func ObfuscateAndNormalize(input string, obfuscator *Obfuscator, normalizer *Nor var lastToken Token // The last token that is not whitespace or comment var groupablePlaceholder groupablePlaceholder + ctes := make(map[string]bool) // Holds the CTEs that are currently being processed + for { token := lexer.Scan() if token.Type == EOF { break } token.Value = obfuscator.ObfuscateTokenValue(token, lexerOpts...) - normalizer.collectMetadata(&token, &lastToken, statementMetadata) + normalizer.collectMetadata(&token, &lastToken, statementMetadata, ctes) normalizer.normalizeSQL(&token, &lastToken, &normalizedSQLBuilder, &groupablePlaceholder, lexerOpts...) 
} diff --git a/obfuscate_and_normalize_test.go b/obfuscate_and_normalize_test.go index 7f10eb0..d84cb78 100644 --- a/obfuscate_and_normalize_test.go +++ b/obfuscate_and_normalize_test.go @@ -308,11 +308,11 @@ multiline comment */ SELECT * FROM SILENCES WHERE ROW_NUMBER = 1;`, expected: `WITH SILENCES AS ( SELECT LOWER ( BASE_TABLE_NAME ), CREATED_DT, SILENCE_UNTIL_DT, REASON, ROW_NUMBER ( ) OVER ( PARTITION BY LOWER ( BASE_TABLE_NAME ) ORDER BY CREATED_DT DESC ) FROM REPORTING.GENERAL.SOME_TABLE WHERE CONTAINS ( ?, LOWER ( DATACENTER_LABEL ) ) ) SELECT * FROM SILENCES WHERE ROW_NUMBER = ?`, statementMetadata: StatementMetadata{ - Tables: []string{"REPORTING.GENERAL.SOME_TABLE", "SILENCES"}, + Tables: []string{"REPORTING.GENERAL.SOME_TABLE"}, Comments: []string{}, Commands: []string{"SELECT"}, Procedures: []string{}, - Size: 42, + Size: 34, }, lexerOpts: []lexerOption{ WithDBMS(DBMSSnowflake), diff --git a/testdata/mssql/complex/extremely-complex-poorly-written-sql.json b/testdata/mssql/complex/extremely-complex-poorly-written-sql.json index a8442bd..978713a 100644 --- a/testdata/mssql/complex/extremely-complex-poorly-written-sql.json +++ b/testdata/mssql/complex/extremely-complex-poorly-written-sql.json @@ -4,8 +4,8 @@ { "expected": "WITH ComplexCTE AS ( SELECT t?.id, t?.amount, ROW_NUMBER ( ) OVER ( PARTITION BY t?.customer_id ORDER BY t?.amount DESC ) FROM ( SELECT id, customer_id, status FROM orders WHERE YEAR ( order_date ) = YEAR ( GETDATE ( ) ) AND status NOT IN ( ? ) ) t? INNER JOIN ( SELECT order_id, SUM ( amount ) FROM order_details GROUP BY order_id ) t? ON t?.id = t?.order_id WHERE t?.amount > ? ), SecondCTE AS ( SELECT c?. *, c?.name, c?.region FROM ComplexCTE c? INNER JOIN customers c? ON c?.customer_id = c?.id WHERE c?.region IN ( ? ) AND c?.rn < ? ) SELECT s.id, s.name, s.amount, p.product_name, CASE WHEN s.amount > ? THEN ? ELSE ? END FROM SecondCTE s LEFT JOIN ( SELECT DISTINCT p?.order_id, p?.product_name FROM order_products p? 
INNER JOIN products p? ON p?.product_id = p?.id ) p ON s.id = p.order_id WHERE s.region = ? AND s.status LIKE ? ORDER BY s.amount DESC, s.name", "statement_metadata": { - "size": 79, - "tables": ["orders", "order_details", "ComplexCTE", "customers", "SecondCTE", "order_products", "products"], + "size": 69, + "tables": ["orders", "order_details", "customers", "SecondCTE", "order_products", "products"], "commands": ["SELECT", "JOIN"], "comments": [], "procedures": [] diff --git a/testdata/mssql/delete/delete-with-cte.json b/testdata/mssql/delete/delete-with-cte.json index 6912195..70d40f8 100644 --- a/testdata/mssql/delete/delete-with-cte.json +++ b/testdata/mssql/delete/delete-with-cte.json @@ -4,8 +4,8 @@ { "expected": "WITH OldOrders AS ( SELECT id FROM orders WHERE order_date < ? ) DELETE FROM orders WHERE id IN ( SELECT id FROM OldOrders )", "statement_metadata": { - "size": 27, - "tables": ["orders", "OldOrders"], + "size": 18, + "tables": ["orders"], "commands": ["SELECT", "DELETE"], "comments": [], "procedures": [] diff --git a/testdata/mssql/select/select-with-cte.json b/testdata/mssql/select/select-with-cte.json index 48add7e..13155e0 100644 --- a/testdata/mssql/select/select-with-cte.json +++ b/testdata/mssql/select/select-with-cte.json @@ -4,8 +4,8 @@ { "expected": "WITH RankedOrders AS ( SELECT o.id, o.customer_id, RANK ( ) OVER ( PARTITION BY o.customer_id ORDER BY o.amount DESC ) FROM orders o ) SELECT id FROM RankedOrders WHERE rnk = ?", "statement_metadata": { - "size": 24, - "tables": ["orders", "RankedOrders"], + "size": 12, + "tables": ["orders"], "commands": ["SELECT"], "comments": [], "procedures": [] diff --git a/testdata/mssql/update/update-with-cte.json b/testdata/mssql/update/update-with-cte.json index c167ed3..97a26cc 100644 --- a/testdata/mssql/update/update-with-cte.json +++ b/testdata/mssql/update/update-with-cte.json @@ -4,8 +4,8 @@ { "expected": "WITH UpdatedOrders AS ( SELECT id FROM orders WHERE order_date < GETDATE ( ) - ? 
) UPDATE o SET o.status = ? FROM orders o JOIN UpdatedOrders uo ON o.id = uo.id", "statement_metadata": { - "size": 36, - "tables": ["orders", "o", "UpdatedOrders"], + "size": 23, + "tables": ["orders", "o"], "commands": ["SELECT", "UPDATE", "JOIN"], "comments": [], "procedures": [] diff --git a/testdata/oracle/complex/super-complex-oracle-query.json b/testdata/oracle/complex/super-complex-oracle-query.json index c1564cc..18fb4c2 100644 --- a/testdata/oracle/complex/super-complex-oracle-query.json +++ b/testdata/oracle/complex/super-complex-oracle-query.json @@ -4,8 +4,8 @@ { "expected": "WITH ranked_sales AS ( SELECT product_id, SUM ( amount ), RANK ( ) OVER ( ORDER BY SUM ( amount ) DESC ) sales_rank FROM sales GROUP BY product_id ), dept_costs AS ( SELECT department_id, SUM ( test_amt ) FROM employees GROUP BY department_id ), latest_transactions AS ( SELECT t.account_id, t.amount, ROW_NUMBER ( ) OVER ( PARTITION BY t.account_id ORDER BY t.transaction_date DESC ) rn FROM transactions t WHERE t.transaction_date >= ADD_MONTHS ( SYSDATE, ? ) ) SELECT e.employee_id, e.last_name, e.test_amt, d.department_name, d.location_id, rs.total_sales, rs.sales_rank, lt.amount FROM employees e INNER JOIN departments d ON e.department_id = d.id LEFT JOIN ranked_sales rs ON e.product_id = rs.product_id LEFT JOIN latest_transactions lt ON e.account_id = lt.account_id AND lt.rn = ? WHERE e.hire_date > ? AND ( d.budget > ( SELECT AVG ( total_sal ) FROM dept_costs ) OR e.test_amt > ( SELECT AVG ( test_amt ) FROM employees WHERE department_id = e.department_id ) ) AND EXISTS ( SELECT ? FROM customer_orders co WHERE co.employee_id = e.employee_id AND co.order_status = ? 
) ORDER BY e.department_id, e.test_amt DESC", "statement_metadata": { - "size": 103, - "tables": ["sales", "employees", "transactions", "departments", "ranked_sales", "latest_transactions", "dept_costs", "customer_orders"], + "size": 91, + "tables": ["sales", "employees", "transactions", "departments", "latest_transactions", "dept_costs", "customer_orders"], "commands": ["SELECT", "JOIN"], "comments": [], "procedures": [] diff --git a/testdata/oracle/select/select-using-with-clause.json b/testdata/oracle/select/select-using-with-clause.json index 6bb2fc5..629c17f 100644 --- a/testdata/oracle/select/select-using-with-clause.json +++ b/testdata/oracle/select/select-using-with-clause.json @@ -4,8 +4,8 @@ { "expected": "WITH dept_costs AS ( SELECT department_id, SUM ( test_amt ) FROM employees GROUP BY department_id ) SELECT * FROM dept_costs WHERE total_sal > ( SELECT AVG ( total_sal ) FROM dept_costs )", "statement_metadata": { - "size": 25, - "tables": ["employees", "dept_costs"], + "size": 15, + "tables": ["employees"], "commands": ["SELECT"], "comments": [], "procedures": [] diff --git a/testdata/postgresql/delete/delete-with-cte.json b/testdata/postgresql/delete/delete-with-cte.json index e721079..7da4a27 100644 --- a/testdata/postgresql/delete/delete-with-cte.json +++ b/testdata/postgresql/delete/delete-with-cte.json @@ -4,10 +4,9 @@ { "expected": "WITH deleted AS ( DELETE FROM users WHERE last_login < NOW ( ) - INTERVAL ? 
RETURNING * ) SELECT * FROM deleted", "statement_metadata": { - "size": 24, + "size": 17, "tables": [ - "users", - "deleted" + "users" ], "commands": [ "DELETE", diff --git a/testdata/postgresql/select/common-table-expressions-cte.json b/testdata/postgresql/select/common-table-expressions-cte.json index 34a1eb9..86753e2 100644 --- a/testdata/postgresql/select/common-table-expressions-cte.json +++ b/testdata/postgresql/select/common-table-expressions-cte.json @@ -4,10 +4,9 @@ { "expected": "WITH recursive_subordinates AS ( SELECT id, manager_id FROM employees WHERE id = ? UNION ALL SELECT e.id, e.manager_id FROM employees e INNER JOIN recursive_subordinates rs ON rs.id = e.manager_id ) SELECT * FROM recursive_subordinates", "statement_metadata": { - "size": 41, + "size": 19, "tables": [ - "employees", - "recursive_subordinates" + "employees" ], "commands": [ "SELECT", diff --git a/testdata/postgresql/update/update-with-cte.json b/testdata/postgresql/update/update-with-cte.json index 60fd0c4..0acbd55 100644 --- a/testdata/postgresql/update/update-with-cte.json +++ b/testdata/postgresql/update/update-with-cte.json @@ -4,10 +4,9 @@ { "expected": "WITH updated AS ( UPDATE users SET name = ? WHERE id = ? RETURNING * ) SELECT * FROM updated", "statement_metadata": { - "size": 24, + "size": 17, "tables": [ - "users", - "updated" + "users" ], "commands": [ "UPDATE",