changes for release 0.26.0, bunch of fixes & improvements
xnuinside committed Mar 28, 2022
1 parent 89f2868 commit 3d0ab28
Showing 16 changed files with 556 additions and 43 deletions.
19 changes: 19 additions & 0 deletions CHANGELOG.txt
@@ -1,3 +1,22 @@
**v0.26.0**
Improvements:

1. Added a more explicit debug message on Statement errors - https://github.com/xnuinside/simple-ddl-parser/issues/116
2. Added support for the "USING INDEX TABLESPACE" statement in ALTER - https://github.com/xnuinside/simple-ddl-parser/issues/119
3. Added support for IN statements in CHECKS (example below) - https://github.com/xnuinside/simple-ddl-parser/issues/121

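A minimal sketch of the kind of `CHECK ... IN` constraint item 3 refers to; the table and column names below are hypothetical, not taken from the issue:

```python
from simple_ddl_parser import DDLParser

# Hypothetical DDL: a column-level CHECK constraint with an IN list.
ddl = """
create table orders (
    status varchar(10) not null check (status in ('NEW', 'DONE'))
);
"""

# With v0.26.0 this is expected to parse instead of raising a statement error.
print(DDLParser(ddl).run())
```
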
New features:
1. Support for SparkSQL USING - https://github.com/xnuinside/simple-ddl-parser/issues/117
Updates initiated by ticket https://github.com/xnuinside/simple-ddl-parser/issues/120:
2. In the Parser you can now pass the argument json_dump=True to the .run() method if you want to get the result in JSON format.
- README updated

Fixes:
1. Added support for PARTITION BY with a single column without a type
2. ALTER TABLE ADD CONSTRAINT PRIMARY KEY - https://github.com/xnuinside/simple-ddl-parser/issues/119
3. Fix for parsing the SET statement - https://github.com/xnuinside/simple-ddl-parser/pull/122
4. Fix for columns without properties disappearing from the output - https://github.com/xnuinside/simple-ddl-parser/issues/123

**v0.25.0**
## Fixes:

44 changes: 42 additions & 2 deletions README.md
@@ -14,7 +14,21 @@ However, in process of adding support for new statements & features I see that o


### How does it work?
The parser is tested on different DDLs, mostly for PostgreSQL & Hive, but the idea is to support as many DDL dialects as possible (AWS Redshift, Oracle, Hive, MsSQL, BigQuery, etc.). You can check the dialect sections after the `Supported Statements` section for more information about which statements from each dialect are already supported by the parser.

Parser supports:

- SQL
- HQL (Hive)
- MSSQL dialect
- Oracle dialect
- MySQL dialect
- PostgreSQL dialect
- BigQuery
- Redshift
- Snowflake
- SparkSQL

You can check the dialect sections after the `Supported Statements` section to get more information about which statements from each dialect are already supported by the parser. If you need support for more statements or new dialects - feel free to open an issue.

### Feel free to open an Issue with a DDL sample
**If you need some statement that is not supported by the parser yet**: please provide a DDL example & information about which SQL dialect or DB it comes from.
@@ -170,6 +184,26 @@ You can provide target path where you want to dump result with argument **-t**,
sdp tests/sql/test_two_tables.sql -t dump_results/

```
### Get Output in JSON

If you want to get the output as JSON on stdout, you can pass the argument **json_dump=True** to the **.run()** method:
```python
from simple_ddl_parser import DDLParser


parse_results = DDLParser("""create table dev.data_sync_history(
data_sync_id bigint not null,
sync_count bigint not null,
); """).run(json_dump=True)

print(parse_results)

```
Output will be:

```json
[{"columns": [{"name": "data_sync_id", "type": "bigint", "size": null, "references": null, "unique": false, "nullable": false, "default": null, "check": null}, {"name": "sync_count", "type": "bigint", "size": null, "references": null, "unique": false, "nullable": false, "default": null, "check": null}], "primary_key": [], "alter": {}, "checks": [], "index": [], "partitioned_by": [], "tablespace": null, "schema": "dev", "table_name": "data_sync_history"}]
```

### More details

@@ -297,7 +331,7 @@ In output you will have names like 'dbo' and 'TO_Requests', not '[dbo]' and '[TO

- STATEMENTS: PRIMARY KEY, CHECK, FOREIGN KEY in table definitions (in create table();)

- ALTER TABLE STATEMENTS: ADD CHECK (with CONSTRAINT), ADD FOREIGN KEY (with CONSTRAINT), ADD UNIQUE, ADD DEFAULT FOR, ALTER TABLE ONLY, ALTER TABLE IF EXISTS
- ALTER TABLE STATEMENTS: ADD CHECK (with CONSTRAINT), ADD FOREIGN KEY (with CONSTRAINT), ADD UNIQUE, ADD DEFAULT FOR, ALTER TABLE ONLY, ALTER TABLE IF EXISTS; ALTER .. PRIMARY KEY; ALTER .. USING INDEX TABLESPACE

- PARTITION BY statement

@@ -319,6 +353,11 @@ In output you will have names like 'dbo' and 'TO_Requests', not '[dbo]' and '[TO

- CREATE DATABASE + Properties parsing

### SparkSQL Dialect statements

- USING (see the sketch below)

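A hedged sketch of the new SparkSQL `USING` support; the table definition is hypothetical and only illustrates the statement form:

```python
from simple_ddl_parser import DDLParser

# Hypothetical SparkSQL-style DDL with a USING clause.
ddl = """
CREATE TABLE student (id INT, name STRING, age INT)
USING CSV;
"""

result = DDLParser(ddl).run()
# Based on the new SparkSQL dialect rule, the table dict is expected
# to carry a "using" entry with the value after USING (here "CSV").
print(result)
```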

### HQL Dialect statements

- PARTITIONED BY statement
@@ -385,6 +424,7 @@ In output you will have names like 'dbo' and 'TO_Requests', not '[dbo]' and '[TO

### TODO in next Releases (if you don't see a feature that you need - open an issue)

-1. Update the command line to parse all arguments that are supported by the Parser
0. Add support for ALTER TABLE ... ADD COLUMN
1. Add more support for CREATE type IS TABLE (example: CREATE OR REPLACE TYPE budget_tbl_typ IS TABLE OF NUMBER(8,2);)
2. Add support (ignore correctly) ALTER TABLE ... DROP CONSTRAINT ..., ALTER TABLE ... DROP INDEX ...
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "simple-ddl-parser"
version = "0.25.0"
version = "0.26.0"
description = "Simple DDL Parser to parse SQL & dialects like HQL, TSQL (MSSQL), Oracle, AWS Redshift, Snowflake, MySQL, PostgreSQL, etc ddl files to json/python dict with full information about columns: types, defaults, primary keys, etc.; sequences, alters, custom types & other entities from ddl."
authors = ["Iuliia Volkova <[email protected]>"]
license = "MIT"
76 changes: 45 additions & 31 deletions simple_ddl_parser/ddl_parser.py
@@ -1,5 +1,7 @@
from typing import Dict, List

from ply.lex import LexToken

from simple_ddl_parser import tokens as tok
from simple_ddl_parser.dialects.bigquery import BigQuery
from simple_ddl_parser.dialects.hql import HQL
@@ -8,6 +10,7 @@
from simple_ddl_parser.dialects.oracle import Oracle
from simple_ddl_parser.dialects.redshift import Redshift
from simple_ddl_parser.dialects.snowflake import Snowflake
from simple_ddl_parser.dialects.spark_sql import SparkSQL
from simple_ddl_parser.dialects.sql import BaseSQL
from simple_ddl_parser.parser import Parser

@@ -17,13 +20,13 @@ class DDLParserError(Exception):


class DDLParser(
Parser, Snowflake, BaseSQL, HQL, MySQL, MSSQL, Oracle, Redshift, BigQuery
Parser, SparkSQL, Snowflake, BaseSQL, HQL, MySQL, MSSQL, Oracle, Redshift, BigQuery
):

tokens = tok.tokens
t_ignore = "\t \r"

def get_tag_symbol_value_and_increment(self, t):
def get_tag_symbol_value_and_increment(self, t: LexToken):
# todo: need to find less hacky way to parse HQL structure types
if "<" in t.value:
t.type = "LT"
@@ -33,15 +36,15 @@ def get_tag_symbol_value_and_increment(self, t):
self.lexer.lt_open -= t.value.count(">")
return t

def after_columns_tokens(self, t):
def after_columns_tokens(self, t: LexToken):
t.type = tok.after_columns_tokens.get(t.value.upper(), t.type)
if t.type != "ID":
self.lexer.after_columns = True
elif self.lexer.columns_def:
t.type = tok.columns_defenition.get(t.value.upper(), t.type)
return t

def process_body_tokens(self, t):
def process_body_tokens(self, t: LexToken):
if (
self.lexer.last_par == "RP" and not self.lexer.lp_open
) or self.lexer.after_columns:
@@ -52,7 +55,7 @@ def process_body_tokens(self, t):
t.type = tok.sequence_reserved.get(t.value.upper(), "ID")
return t

def tokens_not_columns_names(self, t):
def tokens_not_columns_names(self, t: LexToken):
if not self.lexer.check:
for key in tok.symbol_tokens_no_check:
if key in t.value:
@@ -78,28 +81,28 @@ def tokens_not_columns_names(self, t):

return t

def set_lexer_tags(self, t):
def set_lexer_tags(self, t: LexToken):
if t.type == "SEQUENCE":
self.lexer.sequence = True
elif t.type == "CHECK":
self.lexer.check = True

def t_DOT(self, t):
def t_DOT(self, t: LexToken):
r"\."
t.type = "DOT"
return self.set_last_token(t)

def t_STRING(self, t):
def t_STRING(self, t: LexToken):
r"((\')([a-zA-Z_,`0-9:><\=\-\+.\~\%$\!() {}\[\]\/\\\"\#\*&^|?;±§@~]*)(\')){1}"
t.type = "STRING"
return self.set_last_token(t)

def t_DQ_STRING(self, t):
def t_DQ_STRING(self, t: LexToken):
r"((\")([a-zA-Z_,`0-9:><\=\-\+.\~\%$\!() {}'\[\]\/\\\\#\*&^|?;±§@~]*)(\")){1}"
t.type = "DQ_STRING"
return self.set_last_token(t)

def is_token_column_name(self, t):
def is_token_column_name(self, t: LexToken):
"""many of reserved words can be used as column name,
to decide is it a column name or not we need do some checks"""
skip_id_tokens = ["(", ")", ","]
@@ -111,28 +114,34 @@ def is_token_column_name(self, t):
and t.value.upper() not in tok.first_liners
)

def is_creation_name(self, t):
def is_creation_name(self, t: LexToken):
"""many of reserved words can be used as column name,
to decide is it a column name or not we need do some checks"""
skip_id_tokens = ["(", ")", ","]
exceptional_keys = [
"SCHEMA",
"TABLE",
"DATABASE",
"TYPE",
"DOMAIN",
"TABLESPACE",
"INDEX",
"CONSTRAINT",
"EXISTS",
]
return (
t.value not in skip_id_tokens
and t.value.upper() not in ["IF"]
and self.lexer.last_token
in [
"SCHEMA",
"TABLE",
"DATABASE",
"TYPE",
"DOMAIN",
"TABLESPACE",
"INDEX",
"CONSTRAINT",
"EXISTS",
]
and self.lexer.last_token in exceptional_keys
and not self.exceptional_cases(t.value.upper())
)

def t_ID(self, t):
def exceptional_cases(self, value: str) -> bool:
if value == "TABLESPACE" and self.lexer.last_token == "INDEX":
return True
return False

def t_ID(self, t: LexToken):
r"([0-9]\.[0-9])\w|([a-zA-Z_,0-9:><\/\=\-\+\~\%$\*\()!{}\[\]\`\[\]]+)"
t.type = tok.symbol_tokens.get(t.value, "ID")

@@ -141,7 +150,6 @@ def t_ID(self, t):
self.lexer.columns_def = True
self.lexer.last_token = "LP"
return t

elif self.is_token_column_name(t) or self.lexer.last_token == "DOT":
t.type = "ID"
elif t.type != "DQ_STRING" and self.is_creation_name(t):
@@ -156,25 +164,31 @@ def t_ID(self, t):

return self.set_last_token(t)

def commat_type(self, t):
def commat_type(self, t: LexToken):
if t.type == "COMMA" and self.lexer.lt_open:
t.type = "COMMAT"

def capitalize_tokens(self, t):
def capitalize_tokens(self, t: LexToken):
if t.type != "ID" and t.type not in ["LT", "RT"]:
t.value = t.value.upper()

def set_lexx_tags(self, t):
def set_parathesis_tokens(self, t: LexToken):
if t.type in ["RP", "LP"]:
if t.type == "RP" and self.lexer.lp_open:
self.lexer.lp_open -= 1
self.lexer.last_par = t.type

def set_lexx_tags(self, t: LexToken):
self.set_parathesis_tokens(t)

if t.type == "ALTER":
self.lexer.is_alter = True
elif t.type in ["TYPE", "DOMAIN", "TABLESPACE"]:
self.lexer.is_table = False
elif t.type in ["TABLE", "INDEX"]:
elif t.type in ["TABLE", "INDEX"] and not self.lexer.is_alter:
self.lexer.is_table = True

def set_last_token(self, t):
def set_last_token(self, t: LexToken):
self.lexer.last_token = t.type
return t

@@ -190,7 +204,7 @@ def p_id(self, p):
if p[0].startswith(symbol) and p[0].endswith(delimeters_to_end[num]):
p[0] = p[0][1:-1]

def t_error(self, t):
def t_error(self, t: LexToken):
raise DDLParserError("Unknown symbol %r" % (t.value[0],))

def p_error(self, p):
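The `exceptional_cases` hook above keeps `TABLESPACE` as a keyword when it follows `INDEX`, which is what lets the new `ALTER ... USING INDEX TABLESPACE` form parse. A hedged sketch with hypothetical table, constraint and tablespace names:

```python
from simple_ddl_parser import DDLParser

# Hypothetical DDL exercising ALTER TABLE ... ADD CONSTRAINT ... PRIMARY KEY
# together with USING INDEX TABLESPACE (issue #119).
ddl = """
CREATE TABLE foo (id INT NOT NULL);

ALTER TABLE ONLY foo
    ADD CONSTRAINT foo_pkey PRIMARY KEY (id) USING INDEX TABLESPACE fast_space;
"""

# The alter section of the parse result should now record the primary key
# constraint and the index tablespace instead of failing.
print(DDLParser(ddl).run())
```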
5 changes: 3 additions & 2 deletions simple_ddl_parser/dialects/hql.py
@@ -129,9 +129,10 @@ def p_expression_stored_as(self, p):
p[0]["stored_as"] = p_list[-1]

def p_expression_partitioned_by_hql(self, p):
"""expr : expr PARTITIONED BY pid_with_type"""
"""expr : expr PARTITIONED BY pid_with_type
| expr PARTITIONED BY LP pid RP"""
p[0] = p[1]
p_list = list(p)
p_list = remove_par(list(p))
p[0]["partitioned_by"] = p_list[-1]

def p_pid_with_type(self, p):
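The added `PARTITIONED BY LP pid RP` alternative accepts a plain column list without types, which is the fix for "PARTITION BY one column without type". A hedged sketch with hypothetical names:

```python
from simple_ddl_parser import DDLParser

# Hypothetical HQL-style DDL: PARTITIONED BY with a bare column name, no type.
ddl = """
CREATE TABLE sales (
    id INT,
    amount DECIMAL(10, 2)
)
PARTITIONED BY (batch_date);
"""

# partitioned_by in the result should now list the bare column name
# instead of the statement failing to parse.
print(DDLParser(ddl).run())
```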
1 change: 1 addition & 0 deletions simple_ddl_parser/dialects/snowflake.py
@@ -10,6 +10,7 @@ def p_clone(self, p):
def p_table_properties(self, p):
"""table_properties : id id id"""
p_list = list(p)
print(p_list, "table_properties")
p[0] = {p_list[-3]: p_list[-1]}

def p_expression_cluster_by(self, p):
10 changes: 10 additions & 0 deletions simple_ddl_parser/dialects/spark_sql.py
@@ -0,0 +1,10 @@
class SparkSQL:
def p_expression_using(self, p):
"""expr : expr using"""
p[0] = p[1]
p[1].update(p[2])

def p_using(self, p):
"""using : USING id"""
p_list = list(p)
p[0] = {"using": p_list[-1]}