From a0ba802c83bcb12c733d261fb767955500327d83 Mon Sep 17 00:00:00 2001 From: Zhengda Lu Date: Mon, 4 Mar 2024 11:26:39 -0500 Subject: [PATCH] add snowflake support (#33) --- dbms_test.go | 1 + obfuscate_and_normalize_test.go | 103 ++++++++++++++++++ obfuscator_test.go | 67 ++++++++++++ sqllexer.go | 5 +- sqllexer_utils.go | 7 +- testdata/snowflake/test-cases/data-clone.json | 21 ++++ .../snowflake/test-cases/external-data.json | 19 ++++ testdata/snowflake/test-cases/listagg.json | 19 ++++ .../test-cases/materialized-view.json | 20 ++++ .../semi-structured-data-types.json | 19 ++++ testdata/snowflake/test-cases/stream.json | 19 ++++ testdata/snowflake/test-cases/task.json | 22 ++++ .../snowflake/test-cases/time-travel.json | 19 ++++ .../test-cases/warehouse-controls.json | 18 +++ 14 files changed, 357 insertions(+), 2 deletions(-) create mode 100644 testdata/snowflake/test-cases/data-clone.json create mode 100644 testdata/snowflake/test-cases/external-data.json create mode 100644 testdata/snowflake/test-cases/listagg.json create mode 100644 testdata/snowflake/test-cases/materialized-view.json create mode 100644 testdata/snowflake/test-cases/semi-structured-data-types.json create mode 100644 testdata/snowflake/test-cases/stream.json create mode 100644 testdata/snowflake/test-cases/task.json create mode 100644 testdata/snowflake/test-cases/time-travel.json create mode 100644 testdata/snowflake/test-cases/warehouse-controls.json diff --git a/dbms_test.go b/dbms_test.go index 52d2353..25011f2 100644 --- a/dbms_test.go +++ b/dbms_test.go @@ -38,6 +38,7 @@ func TestQueriesPerDBMS(t *testing.T) { DBMSOracle, DBMSSQLServer, DBMSMySQL, + DBMSSnowflake, } for _, dbms := range dbmsTypes { diff --git a/obfuscate_and_normalize_test.go b/obfuscate_and_normalize_test.go index 729ef9f..7f10eb0 100644 --- a/obfuscate_and_normalize_test.go +++ b/obfuscate_and_normalize_test.go @@ -297,6 +297,109 @@ multiline comment */ WithDBMS(DBMSSQLServer), }, }, + { + input: ` + WITH SILENCES AS 
( + SELECT LOWER(BASE_TABLE_NAME), CREATED_DT, SILENCE_UNTIL_DT, REASON + ,ROW_NUMBER() OVER (PARTITION BY LOWER(BASE_TABLE_NAME) ORDER BY CREATED_DT DESC) AS ROW_NUMBER + FROM REPORTING.GENERAL.SOME_TABLE + WHERE CONTAINS('us1', LOWER(DATACENTER_LABEL)) + ) + SELECT * FROM SILENCES WHERE ROW_NUMBER = 1;`, + expected: `WITH SILENCES AS ( SELECT LOWER ( BASE_TABLE_NAME ), CREATED_DT, SILENCE_UNTIL_DT, REASON, ROW_NUMBER ( ) OVER ( PARTITION BY LOWER ( BASE_TABLE_NAME ) ORDER BY CREATED_DT DESC ) FROM REPORTING.GENERAL.SOME_TABLE WHERE CONTAINS ( ?, LOWER ( DATACENTER_LABEL ) ) ) SELECT * FROM SILENCES WHERE ROW_NUMBER = ?`, + statementMetadata: StatementMetadata{ + Tables: []string{"REPORTING.GENERAL.SOME_TABLE", "SILENCES"}, + Comments: []string{}, + Commands: []string{"SELECT"}, + Procedures: []string{}, + Size: 42, + }, + lexerOpts: []lexerOption{ + WithDBMS(DBMSSnowflake), + }, + }, + { + input: `USE WAREHOUSE "SOME_WAREHOUSE";`, + expected: `USE WAREHOUSE SOME_WAREHOUSE`, // double quoted identifier are not replaced + statementMetadata: StatementMetadata{ + Tables: []string{}, + Comments: []string{}, + Commands: []string{"USE"}, + Procedures: []string{}, + Size: 3, + }, + lexerOpts: []lexerOption{ + WithDBMS(DBMSSnowflake), + }, + }, + { + input: `SELECT 1 FROM REPORTING.GENERAL.SOME_RANDOM_TABLE + WHERE BASE_TABLE_NAME='xxx_ttt_zzz_v1' + AND DATACENTER_LABEL='us3' + AND CENSUS_ELEMENT_ID='bef52c3f-788f-4fb3-b116-a05a1c4a9792';`, + expected: `SELECT ? FROM REPORTING.GENERAL.SOME_RANDOM_TABLE WHERE BASE_TABLE_NAME = ? AND DATACENTER_LABEL = ? 
AND CENSUS_ELEMENT_ID = ?`, + statementMetadata: StatementMetadata{ + Tables: []string{"REPORTING.GENERAL.SOME_RANDOM_TABLE"}, + Comments: []string{}, + Commands: []string{"SELECT"}, + Procedures: []string{}, + Size: 41, + }, + lexerOpts: []lexerOption{ + WithDBMS(DBMSSnowflake), + }, + }, + { + input: `COPY INTO REPORTING.GENERAL.MY_TABLE + (FEATURE,DESCRIPTION,COVERAGE,DATE_PARTITION) + FROM (SELECT $1,$2,$3,TO_TIMESTAMP('2023-12-14 00:00:00') FROM @REPORTING.GENERAL.SOME_DESCRIPTIONS/external_data/) + file_format=(type=CSV SKIP_HEADER=1 FIELD_OPTIONALLY_ENCLOSED_BY='\"' ESCAPE_UNENCLOSED_FIELD='\\' FIELD_DELIMITER=',' ) + ;`, + expected: `COPY INTO REPORTING.GENERAL.MY_TABLE ( FEATURE, DESCRIPTION, COVERAGE, DATE_PARTITION ) FROM ( SELECT $1, $2, $3, TO_TIMESTAMP ( ? ) FROM @REPORTING.GENERAL.SOME_DESCRIPTIONS/external_data/ ) file_format = ( type = CSV SKIP_HEADER = ? FIELD_OPTIONALLY_ENCLOSED_BY = ? ESCAPE_UNENCLOSED_FIELD = ? FIELD_DELIMITER = ? )`, + statementMetadata: StatementMetadata{ + Tables: []string{"REPORTING.GENERAL.MY_TABLE", "@REPORTING.GENERAL.SOME_DESCRIPTIONS/external_data/"}, + Comments: []string{}, + Commands: []string{"SELECT"}, + Procedures: []string{}, + Size: 83, + }, + lexerOpts: []lexerOption{ + WithDBMS(DBMSSnowflake), + }, + }, + { + input: `SELECT EXISTS( + SELECT * FROM REPORTING.INFORMATION_SCHEMA.TABLES + WHERE table_schema='XXX_YYY' + AND table_name='ABC' + AND table_type='EXTERNAL TABLE' + );`, + expected: `SELECT EXISTS ( SELECT * FROM REPORTING.INFORMATION_SCHEMA.TABLES WHERE table_schema = ? AND table_name = ? AND table_type = ? 
)`, + statementMetadata: StatementMetadata{ + Tables: []string{"REPORTING.INFORMATION_SCHEMA.TABLES"}, + Comments: []string{}, + Commands: []string{"SELECT"}, + Procedures: []string{}, + Size: 41, + }, + lexerOpts: []lexerOption{ + WithDBMS(DBMSSnowflake), + }, + }, + { + input: `ALTER EXTERNAL TABLE REPORTING.TEST.MY_TABLE REFRESH '2024_01_15';`, + expected: `ALTER EXTERNAL TABLE REPORTING.TEST.MY_TABLE REFRESH ?`, + statementMetadata: StatementMetadata{ + Tables: []string{"REPORTING.TEST.MY_TABLE"}, + Comments: []string{}, + Commands: []string{"ALTER"}, + Procedures: []string{}, + Size: 28, + }, + lexerOpts: []lexerOption{ + WithDBMS(DBMSSnowflake), + }, + }, } obfuscator := NewObfuscator( diff --git a/obfuscator_test.go b/obfuscator_test.go index e52d5a9..24880bd 100644 --- a/obfuscator_test.go +++ b/obfuscator_test.go @@ -421,6 +421,73 @@ func TestObfuscator(t *testing.T) { expected: "SELECT * FROM users where id = :id and name = :1", dbms: DBMSOracle, }, + { + input: ` + WITH SILENCES AS ( + SELECT LOWER(BASE_TABLE_NAME), CREATED_DT, SILENCE_UNTIL_DT, REASON + ,ROW_NUMBER() OVER (PARTITION BY LOWER(BASE_TABLE_NAME) ORDER BY CREATED_DT DESC) AS ROW_NUMBER + FROM REPORTING.GENERAL.SOME_TABLE + WHERE CONTAINS('us1', LOWER(DATACENTER_LABEL)) + ) + SELECT * FROM SILENCES WHERE ROW_NUMBER = 1;`, + expected: `WITH SILENCES AS ( + SELECT LOWER(BASE_TABLE_NAME), CREATED_DT, SILENCE_UNTIL_DT, REASON + ,ROW_NUMBER() OVER (PARTITION BY LOWER(BASE_TABLE_NAME) ORDER BY CREATED_DT DESC) AS ROW_NUMBER + FROM REPORTING.GENERAL.SOME_TABLE + WHERE CONTAINS(?, LOWER(DATACENTER_LABEL)) + ) + SELECT * FROM SILENCES WHERE ROW_NUMBER = ?;`, + dbms: DBMSSnowflake, + }, + { + input: `USE WAREHOUSE "SOME_WAREHOUSE";`, + expected: `USE WAREHOUSE "SOME_WAREHOUSE";`, // double quoted identifier are not replaced + dbms: DBMSSnowflake, + }, + { + input: `SELECT 1 FROM REPORTING.GENERAL.SOME_RANDOM_TABLE + WHERE BASE_TABLE_NAME='xxx_ttt_zzz_v1' + AND DATACENTER_LABEL='us3' + AND 
CENSUS_ELEMENT_ID='bef52c3f-788f-4fb3-b116-a05a1c4a9792';`, + expected: `SELECT ? FROM REPORTING.GENERAL.SOME_RANDOM_TABLE + WHERE BASE_TABLE_NAME=? + AND DATACENTER_LABEL=? + AND CENSUS_ELEMENT_ID=?;`, + dbms: DBMSSnowflake, + }, + { + input: `COPY INTO REPORTING.GENERAL.MY_TABLE + (FEATURE,DESCRIPTION,COVERAGE,DATE_PARTITION) + FROM (SELECT $1,$2,$3,TO_TIMESTAMP('2023-12-14 00:00:00') FROM @REPORTING.GENERAL.SOME_DESCRIPTIONS/external_data/2023/12/14/) + file_format=(type=CSV SKIP_HEADER=1 FIELD_OPTIONALLY_ENCLOSED_BY='\"' ESCAPE_UNENCLOSED_FIELD='\\' FIELD_DELIMITER=',' ) + ;`, + expected: `COPY INTO REPORTING.GENERAL.MY_TABLE + (FEATURE,DESCRIPTION,COVERAGE,DATE_PARTITION) + FROM (SELECT $1,$2,$3,TO_TIMESTAMP(?) FROM @REPORTING.GENERAL.SOME_DESCRIPTIONS/external_data/2023/12/14/) + file_format=(type=CSV SKIP_HEADER=? FIELD_OPTIONALLY_ENCLOSED_BY=? ESCAPE_UNENCLOSED_FIELD=? FIELD_DELIMITER=? ) + ;`, + dbms: DBMSSnowflake, + }, + { + input: `SELECT EXISTS( + SELECT * FROM REPORTING.INFORMATION_SCHEMA.TABLES + WHERE table_schema='XXX_YYY' + AND table_name='ABC' + AND table_type='EXTERNAL TABLE' + );`, + expected: `SELECT EXISTS( + SELECT * FROM REPORTING.INFORMATION_SCHEMA.TABLES + WHERE table_schema=? + AND table_name=? + AND table_type=? 
+ );`, + dbms: DBMSSnowflake, + }, + { + input: `ALTER EXTERNAL TABLE REPORTING.TEST.MY_TABLE REFRESH '2024_01_15';`, + expected: `ALTER EXTERNAL TABLE REPORTING.TEST.MY_TABLE REFRESH ?;`, + dbms: DBMSSnowflake, + }, } for _, tt := range tests { diff --git a/sqllexer.go b/sqllexer.go index a9925c9..f092622 100644 --- a/sqllexer.go +++ b/sqllexer.go @@ -153,6 +153,9 @@ func (s *Lexer) Scan() Token { fallthrough case ch == '@': if isAlphaNumeric(s.lookAhead(1)) { + if s.config.DBMS == DBMSSnowflake { + return s.scanIdentifier(ch) + } return s.scanBindParameter() } else if s.lookAhead(1) == '@' { return s.scanSystemVariable() @@ -314,7 +317,7 @@ func (s *Lexer) scanIdentifier(ch rune) Token { // NOTE: this func does not distinguish between SQL keywords and identifiers s.start = s.cursor ch = s.nextBy(utf8.RuneLen(ch)) - for isLetter(ch) || isDigit(ch) || ch == '.' || ch == '?' || ch == '$' || ch == '#' { + for isLetter(ch) || isDigit(ch) || ch == '.' || ch == '?' || ch == '$' || ch == '#' || ch == '/' { ch = s.nextBy(utf8.RuneLen(ch)) } if ch == '(' { diff --git a/sqllexer_utils.go b/sqllexer_utils.go index 7283872..f694ce7 100644 --- a/sqllexer_utils.go +++ b/sqllexer_utils.go @@ -8,7 +8,7 @@ import ( type DBMSType string const ( - // DBMSSQLServer is a MS SQL Server + // DBMSSQLServer is a MS SQL Server DBMSSQLServer DBMSType = "mssql" // DBMSPostgres is a PostgreSQL Server DBMSPostgres DBMSType = "postgresql" @@ -16,6 +16,8 @@ const ( DBMSMySQL DBMSType = "mysql" // DBMSOracle is a Oracle Server DBMSOracle DBMSType = "oracle" + // DBMSSnowflake is a Snowflake Server + DBMSSnowflake DBMSType = "snowflake" ) var commands = map[string]bool{ @@ -37,6 +39,8 @@ var commands = map[string]bool{ "EXEC": true, "EXPLAIN": true, "STRAIGHT_JOIN": true, + "USE": true, + "CLONE": true, } var tableIndicators = map[string]bool{ @@ -46,6 +50,7 @@ var tableIndicators = map[string]bool{ "UPDATE": true, "TABLE": true, "STRAIGHT_JOIN": true, // MySQL + "CLONE": true, // Snowflake } var 
keywords = map[string]bool{ diff --git a/testdata/snowflake/test-cases/data-clone.json b/testdata/snowflake/test-cases/data-clone.json new file mode 100644 index 0000000..fa93a67 --- /dev/null +++ b/testdata/snowflake/test-cases/data-clone.json @@ -0,0 +1,21 @@ +{ + "input": "CREATE TABLE new_table CLONE existing_table;", + "outputs": [ + { + "expected": "CREATE TABLE new_table CLONE existing_table", + "statement_metadata": { + "size": 34, + "tables": [ + "new_table", + "existing_table" + ], + "commands": [ + "CREATE", + "CLONE" + ], + "comments": [], + "procedures": [] + } + } + ] + } \ No newline at end of file diff --git a/testdata/snowflake/test-cases/external-data.json b/testdata/snowflake/test-cases/external-data.json new file mode 100644 index 0000000..931a73d --- /dev/null +++ b/testdata/snowflake/test-cases/external-data.json @@ -0,0 +1,19 @@ +{ + "input": "CREATE EXTERNAL TABLE ext_sales_data (sale_date DATE, product_id STRING, quantity_sold NUMBER) WITH LOCATION = @my_external_stage/sales_data/ FILE_FORMAT = (TYPE = 'CSV' FIELD_OPTIONALLY_ENCLOSED_BY = '\"');", + "outputs": [ + { + "expected": "CREATE EXTERNAL TABLE ext_sales_data ( sale_date DATE, product_id STRING, quantity_sold NUMBER ) WITH LOCATION = @my_external_stage/sales_data/ FILE_FORMAT = ( TYPE = ? FIELD_OPTIONALLY_ENCLOSED_BY = ? )", + "statement_metadata": { + "size": 20, + "tables": [ + "ext_sales_data" + ], + "commands": [ + "CREATE" + ], + "comments": [], + "procedures": [] + } + } + ] + } \ No newline at end of file diff --git a/testdata/snowflake/test-cases/listagg.json b/testdata/snowflake/test-cases/listagg.json new file mode 100644 index 0000000..0c10680 --- /dev/null +++ b/testdata/snowflake/test-cases/listagg.json @@ -0,0 +1,19 @@ +{ + "input": "SELECT LISTAGG(product_name, ', ') WITHIN GROUP (ORDER BY product_name) AS product_list FROM products WHERE category_id = 1;", + "outputs": [ + { + "expected": "SELECT LISTAGG ( product_name, ? 
) WITHIN GROUP ( ORDER BY product_name ) FROM products WHERE category_id = ?", + "statement_metadata": { + "size": 14, + "tables": [ + "products" + ], + "commands": [ + "SELECT" + ], + "comments": [], + "procedures": [] + } + } + ] + } \ No newline at end of file diff --git a/testdata/snowflake/test-cases/materialized-view.json b/testdata/snowflake/test-cases/materialized-view.json new file mode 100644 index 0000000..3dbbfe4 --- /dev/null +++ b/testdata/snowflake/test-cases/materialized-view.json @@ -0,0 +1,20 @@ +{ + "input": "CREATE MATERIALIZED VIEW mv_product_sales AS SELECT product_id, SUM(sales_amount) AS total_sales FROM sales GROUP BY product_id;", + "outputs": [ + { + "expected": "CREATE MATERIALIZED VIEW mv_product_sales AS SELECT product_id, SUM ( sales_amount ) FROM sales GROUP BY product_id", + "statement_metadata": { + "size": 17, + "tables": [ + "sales" + ], + "commands": [ + "CREATE", + "SELECT" + ], + "comments": [], + "procedures": [] + } + } + ] + } \ No newline at end of file diff --git a/testdata/snowflake/test-cases/semi-structured-data-types.json b/testdata/snowflake/test-cases/semi-structured-data-types.json new file mode 100644 index 0000000..3153b66 --- /dev/null +++ b/testdata/snowflake/test-cases/semi-structured-data-types.json @@ -0,0 +1,19 @@ +{ + "input": "SELECT metadata:customerID::string AS customer_id FROM orders WHERE metadata:orderDate::date = '2023-01-01';", + "outputs": [ + { + "expected": "SELECT metadata : customerID :: string FROM orders WHERE metadata : orderDate :: date = ?", + "statement_metadata": { + "size": 12, + "tables": [ + "orders" + ], + "commands": [ + "SELECT" + ], + "comments": [], + "procedures": [] + } + } + ] + } \ No newline at end of file diff --git a/testdata/snowflake/test-cases/stream.json b/testdata/snowflake/test-cases/stream.json new file mode 100644 index 0000000..6b9aa76 --- /dev/null +++ b/testdata/snowflake/test-cases/stream.json @@ -0,0 +1,19 @@ +{ + "input": "CREATE STREAM my_stream ON TABLE 
my_table;", + "outputs": [ + { + "expected": "CREATE STREAM my_stream ON TABLE my_table", + "statement_metadata": { + "size": 14, + "tables": [ + "my_table" + ], + "commands": [ + "CREATE" + ], + "comments": [], + "procedures": [] + } + } + ] + } \ No newline at end of file diff --git a/testdata/snowflake/test-cases/task.json b/testdata/snowflake/test-cases/task.json new file mode 100644 index 0000000..3f2b13a --- /dev/null +++ b/testdata/snowflake/test-cases/task.json @@ -0,0 +1,22 @@ +{ + "input": "CREATE TASK /* my comment */ my_task WAREHOUSE = my_warehouse SCHEDULE = '15 MINUTE' AS INSERT INTO summary_table SELECT * FROM new_data_view;", + "outputs": [ + { + "expected": "CREATE TASK my_task WAREHOUSE = my_warehouse SCHEDULE = ? AS INSERT INTO summary_table SELECT * FROM new_data_view", + "statement_metadata": { + "size": 60, + "tables": [ + "summary_table", + "new_data_view" + ], + "commands": [ + "CREATE", + "INSERT", + "SELECT" + ], + "comments": ["/* my comment */"], + "procedures": [] + } + } + ] + } \ No newline at end of file diff --git a/testdata/snowflake/test-cases/time-travel.json b/testdata/snowflake/test-cases/time-travel.json new file mode 100644 index 0000000..45415f3 --- /dev/null +++ b/testdata/snowflake/test-cases/time-travel.json @@ -0,0 +1,19 @@ +{ + "input": "SELECT * FROM my_table AT (TIMESTAMP => '2023-03-15 14:30:00');", + "outputs": [ + { + "expected": "SELECT * FROM my_table AT ( TIMESTAMP => ? 
)", + "statement_metadata": { + "size": 14, + "tables": [ + "my_table" + ], + "commands": [ + "SELECT" + ], + "comments": [], + "procedures": [] + } + } + ] + } \ No newline at end of file diff --git a/testdata/snowflake/test-cases/warehouse-controls.json b/testdata/snowflake/test-cases/warehouse-controls.json new file mode 100644 index 0000000..300cd2f --- /dev/null +++ b/testdata/snowflake/test-cases/warehouse-controls.json @@ -0,0 +1,18 @@ +{ + "input": "ALTER WAREHOUSE my_warehouse SET WAREHOUSE_SIZE = 'X-LARGE';", + "outputs": [ + { + "expected": "ALTER WAREHOUSE my_warehouse SET WAREHOUSE_SIZE = ?", + "statement_metadata": { + "size": 5, + "tables": [ + ], + "commands": [ + "ALTER" + ], + "comments": [], + "procedures": [] + } + } + ] + } \ No newline at end of file