validly parse comment statement after schema & table (#129)
xnuinside authored May 6, 2022
1 parent 60f586d commit 50c53af
Showing 14 changed files with 176 additions and 20 deletions.
2 changes: 1 addition & 1 deletion .flake8
@@ -1,5 +1,5 @@
[flake8]
-exclude = .github,.git,__pycache__,docs/source/conf.py,old,build,dist,simple_ddl_parser/parsetab.py,./test.py,simple_ddl_parser/test.py
+exclude = .github,.git,__pycache__,docs/source/conf.py,old,build,dist,tests/,simple_ddl_parser/parsetab.py,./test.py,simple_ddl_parser/test.py
max-complexity = 10
max-line-length = 120
ignore = W503, E999
11 changes: 11 additions & 0 deletions CHANGELOG.txt
@@ -1,3 +1,14 @@
**v0.26.2**

Fixes:
1. Fixed a major bug that caused lines containing 'USE' & 'GO' strings to be parsed incorrectly.
2. Fixed parsing of CREATE SCHEMA for Snowflake & Oracle DDLs.

Improvements:
1. Added support for the COMMENT statement in CREATE TABLE DDL (for Snowflake dialect support).
2. Added support for the COMMENT statement in CREATE SCHEMA DDL (for Snowflake dialect support).


**v0.26.1**

Fixes:
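As a quick, hedged illustration of the new COMMENT support: the `DDLParser` call and the expected output below mirror the tests added in this commit, while the one-line DDL string is just a sample.

```python
from simple_ddl_parser import DDLParser

# CREATE SCHEMA with a trailing comment clause (Snowflake dialect)
ddl = "create schema my_schema comment='this is comment1';"
result = DDLParser(ddl, normalize_names=True).run(output_mode="snowflake")
print(result)
# [{'comment': "'this is comment1'", 'schema_name': 'my_schema'}]
```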
1 change: 1 addition & 0 deletions README.md
@@ -414,6 +414,7 @@ In output you will have names like 'dbo' and 'TO_Requests', not '[dbo]' and '[TO
- CREATE .. CLONE statements for table, database and schema
- CREATE TABLE .. CLUSTER BY ..
- CONSTRAINT .. [NOT] ENFORCED
- COMMENT = in CREATE TABLE & CREATE SCHEMA statements

### BigQuery

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "simple-ddl-parser"
version = "0.26.1"
version = "0.26.2"
description = "Simple DDL Parser to parse SQL & dialects like HQL, TSQL (MSSQL), Oracle, AWS Redshift, Snowflake, MySQL, PostgreSQL, etc ddl files to json/python dict with full information about columns: types, defaults, primary keys, etc.; sequences, alters, custom types & other entities from ddl."
authors = ["Iuliia Volkova <[email protected]>"]
license = "MIT"
1 change: 1 addition & 0 deletions simple_ddl_parser/ddl_parser.py
@@ -203,6 +203,7 @@ def set_lexx_tags(self, t: LexToken):

    def set_last_token(self, t: LexToken):
        self.lexer.last_token = t.type

        return t

    def p_id(self, p):
18 changes: 18 additions & 0 deletions simple_ddl_parser/dialects/snowflake.py
@@ -14,3 +14,21 @@ def p_expression_cluster_by(self, p):
        p[0] = p[1]
        p_list = remove_par(list(p))
        p[0]["cluster_by"] = p_list[-1]

    def p_table_comment(self, p):
        """expr : expr option_comment"""
        p[0] = p[1]
        if p[2]:
            p[0].update(p[2])

    def p_option_comment(self, p):
        """option_comment : ID STRING
        | ID DQ_STRING
        | COMMENT ID STRING
        | COMMENT ID DQ_STRING
        """
        p_list = remove_par(list(p))
        if "comment" in p[1].lower():
            p[0] = {"comment": p_list[-1]}
8 changes: 8 additions & 0 deletions simple_ddl_parser/dialects/sql.py
@@ -416,6 +416,7 @@ def set_auth_property_in_schema(self, p: List, p_list: List) -> None:
    def p_c_schema(self, p: List) -> None:
        """c_schema : CREATE SCHEMA
        | CREATE ID SCHEMA"""

        if len(p) == 4:
            p[0] = {"remote": True}

@@ -424,16 +425,23 @@ def p_create_schema(self, p: List) -> None:
        | c_schema id id id
        | c_schema id
        | c_schema id DOT id
        | c_schema id option_comment
        | c_schema id DOT id option_comment
        | c_schema IF NOT EXISTS id
        | c_schema IF NOT EXISTS id DOT id
        | create_schema id id id
        | create_schema id id STRING
        | create_schema options
        """
        p_list = list(p)

        p[0] = {}
        auth_index = None

        if "comment" in p_list[-1]:
            p[0].update(p_list[-1])
            del p_list[-1]

        self.add_if_not_exists(p[0], p_list)
        if isinstance(p_list[1], dict):
            p[0] = p_list[1]
17 changes: 10 additions & 7 deletions simple_ddl_parser/output/common.py
Expand Up @@ -145,19 +145,22 @@ def process_alter_and_index_result(

def process_entities(tables_dict: Dict, table: Dict, output_mode: str) -> Dict:
    """process tables, types, sequences and other entities data"""
-    table_data = init_table_data()
-    table_data = d.populate_dialects_table_data(output_mode, table_data)
-    not_table = False
+    is_it_table = True

    if table.get("table_name"):
+        table_data = init_table_data()
+        table_data = d.populate_dialects_table_data(output_mode, table_data)
        table_data.update(table)
        table_data = set_unique_columns(table_data)
    else:
        table_data = table
-        not_table = True
-    if not not_table:
-        table_data = process_not_table_item(table_data, tables_dict)
+        is_it_table = False
+
+    if is_it_table:
+        table_data = process_is_it_table_item(table_data, tables_dict)

    table_data = normalize_ref_columns_in_final_output(table_data)

    d.dialects_clean_up(output_mode, table_data)
    return table_data

@@ -183,7 +186,7 @@ def result_format(
    return final_result


-def process_not_table_item(table_data: Dict, tables_dict: Dict) -> Dict:
+def process_is_it_table_item(table_data: Dict, tables_dict: Dict) -> Dict:
    if table_data.get("table_name"):
        tables_dict[(table_data["table_name"], table_data["schema"])] = table_data
    else:
8 changes: 4 additions & 4 deletions simple_ddl_parser/output/dialects.py
Expand Up @@ -13,7 +13,6 @@
"fields_terminated_by",
"collection_items_terminated_by",
"map_keys_terminated_by",
"comment",
]


@@ -145,16 +144,17 @@ def dialects_clean_up(output_mode: str, table_data: Dict) -> Dict:
    key_cleaning(table_data, output_mode)
    update_mappers_for_table_properties = {"bigquery": update_bigquery_output}
    update_table_prop = update_mappers_for_table_properties.get(output_mode)

    if update_table_prop:
        table_data = update_table_prop(table_data)

    if output_mode == "oracle":
-        for column in table_data["columns"]:
+        for column in table_data.get("columns", []):
            column = add_additional_oracle_keys_in_column(column)
    elif output_mode == "snowflake":
-        for column in table_data["columns"]:
+        # can be no columns if it is a create database or create schema
+        for column in table_data.get("columns", []):
            column = add_additional_snowflake_keys_in_column(column)

    elif output_mode == "redshift":
        table_data = process_redshift_dialect(table_data)
    return table_data
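A small sketch of why the `.get("columns", [])` guard matters — the `table_data` value here is hypothetical, but its shape matches the schema-only results in the tests added by this commit:

```python
# A CREATE SCHEMA / CREATE DATABASE result has no "columns" key at all:
table_data = {"schema_name": "my_schema"}

# The old lookup raised KeyError for such results:
#     for column in table_data["columns"]: ...
# The .get() fallback simply iterates zero times instead:
for column in table_data.get("columns", []):
    pass  # never reached for a schema-only result
```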
13 changes: 6 additions & 7 deletions simple_ddl_parser/parser.py
Expand Up @@ -163,16 +163,16 @@ def check_new_statement_start(self, line: str) -> bool:
        return self.new_statement

    def check_line_on_skip_words(self) -> bool:
-        skip_line_words = ["USE", "GO"]
+        skip_regex = r"^(GO|USE)\b"

        self.skip = False
-        for word in skip_line_words:
-            if self.line.startswith(word):
-                self.skip = True
-                break

+        if re.match(skip_regex, self.line.upper()):
+            self.skip = True
        return self.skip

    def add_line_to_statement(self) -> str:

        if (
            self.line
            and not self.skip
@@ -206,15 +206,13 @@ def process_line(
        self.pre_process_line()

        self.line = self.line.strip().replace("\n", "").replace("\t", "")

        self.skip = self.check_line_on_skip_words()

        self.parse_set_statement()
        # to avoid issues when a comma or parenthesis is glued to the column name
        self.check_new_statement_start(self.line)

        final_line = self.line.endswith(";") and not self.set_was_in_line

        self.add_line_to_statement()

        if final_line or self.new_statement:
@@ -237,6 +235,7 @@ def process_statement(self) -> None:
        self.statement = None

    def parse_statement(self) -> None:

        _parse_result = yacc.parse(self.statement)
        if _parse_result:
            self.tables.append(_parse_result)
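A condensed sketch of what the word-boundary regex fixes (not the parser's exact code path — both sides are uppercased here for brevity, while the old code matched case-sensitively): `startswith` also matched column names that merely begin with a skip word, such as USER_COMMENT.

```python
import re

skip_regex = r"^(GO|USE)\b"  # the pattern introduced in this commit

for line in ["USE my_db;", "GO", "USER_COMMENT VARCHAR(100),"]:
    old_skip = line.upper().startswith(("USE", "GO"))    # pre-0.26.2 behavior
    new_skip = bool(re.match(skip_regex, line.upper()))  # 0.26.2 behavior
    print(f"{line!r}: old={old_skip}, new={new_skip}")

# 'USE my_db;': old=True, new=True
# 'GO': old=True, new=True
# 'USER_COMMENT VARCHAR(100),': old=True, new=False  <- the fixed false positive
```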
35 changes: 35 additions & 0 deletions tests/non_statement_tests/test_common.py
@@ -211,3 +211,38 @@ def test_flag_normalize_names_mixed_usage():
        'ddl_properties': []
    }
    assert expected == result


def test_parsing_go_and_use_correctly():
    ddl = """
    create TABLE ASIN.EXCLUSION (
    USER_COMMENT VARCHAR(100),
    );
    """
    result = DDLParser(ddl, normalize_names=True).run(output_mode="hql")
    expected = [{'alter': {},
                 'checks': [],
                 'collection_items_terminated_by': None,
                 'columns': [{'check': None,
                              'default': None,
                              'name': 'USER_COMMENT',
                              'nullable': True,
                              'references': None,
                              'size': 100,
                              'type': 'VARCHAR',
                              'unique': False}],
                 'comment': None,
                 'external': False,
                 'fields_terminated_by': None,
                 'index': [],
                 'lines_terminated_by': None,
                 'location': None,
                 'map_keys_terminated_by': None,
                 'partitioned_by': [],
                 'primary_key': [],
                 'row_format': None,
                 'schema': 'ASIN',
                 'stored_as': None,
                 'table_name': 'EXCLUSION',
                 'tablespace': None}]
    assert expected == result
1 change: 1 addition & 0 deletions tests/test_simple_ddl_parser.py
@@ -1234,6 +1234,7 @@ def test_comments_in_columns():
"tablespace": None,
"schema": None,
"table_name": "test_table",
"comment": "'This is test table'",
}
],
"types": [],
Expand Down
78 changes: 78 additions & 0 deletions tests/test_snowflake.py
@@ -184,3 +184,81 @@ def test_enforced():
"types": [],
}
assert expected == result


def test_table_comment_parsed_validly():
    ddl = """
    create TABLE ASIN.EXCLUSION (
    USER_COMMENT VARCHAR(100),
    PROCESS_SQN NUMBER(10,0) NOT NULL,
    constraint PK_EXCLUSION primary key (ASIN)
    ) COMMENT ='ASINs to be excluded from the ASIN List File'
    ;
    """
    result_one = DDLParser(ddl, normalize_names=True).run(output_mode="snowflake")

    ddl = """
    create TABLE ASIN.EXCLUSION (
    USER_COMMENT VARCHAR(100),
    PROCESS_SQN NUMBER(10,0) NOT NULL,
    constraint PK_EXCLUSION primary key (ASIN)
    ) COMMENT='ASINs to be excluded from the ASIN List File'
    ;
    """
    result_two = DDLParser(ddl, normalize_names=True).run(output_mode="snowflake")

    expected = [{'alter': {},
                 'checks': [],
                 'clone': None,
                 'columns': [{'check': None,
                              'default': None,
                              'name': 'USER_COMMENT',
                              'nullable': True,
                              'references': None,
                              'size': 100,
                              'type': 'VARCHAR',
                              'unique': False},
                             {'check': None,
                              'default': None,
                              'name': 'PROCESS_SQN',
                              'nullable': False,
                              'references': None,
                              'size': (10, 0),
                              'type': 'NUMBER',
                              'unique': False}],
                 'constraints': {'primary_keys': [{'columns': ['ASIN'],
                                                   'constraint_name': 'PK_EXCLUSION'}]},
                 'comment': "'ASINs to be excluded from the ASIN List File'",
                 'index': [],
                 'partitioned_by': [],
                 'primary_key': ['ASIN'],
                 'primary_key_enforced': None,
                 'schema': 'ASIN',
                 'table_name': 'EXCLUSION',
                 'tablespace': None}]

    assert expected == result_one == result_two


def test_schema_parsed_normally():
    ddl = """
    create schema my_schema;
    """
    result = DDLParser(ddl, normalize_names=True).run(output_mode="snowflake")

    expected = [{'schema_name': 'my_schema'}]

    assert result == expected


def test_comment_on_create_schema():
    ddl = """
    create schema my_schema comment='this is comment1';
    """
    result = DDLParser(ddl, normalize_names=True).run(output_mode="snowflake")
    expected = [{'comment': "'this is comment1'", 'schema_name': 'my_schema'}]
    assert result == expected
1 change: 1 addition & 0 deletions tests/test_spark_sql.py
@@ -57,6 +57,7 @@ def test_spark_sql_using():
"tablespace": None,
"tblproperties": {"'foo'": "'bar'"},
"using": "CSV",
'comment': "'this is a comment'",
}
],
"types": [],
