From 37960473a6c4d385eae090d0cf0e585447442a58 Mon Sep 17 00:00:00 2001 From: Rudolf Cardinal Date: Wed, 8 Jan 2025 17:08:41 +0000 Subject: [PATCH] Add Databricks support --- cardinal_pythonlib/sql/validation.py | 44 +++++++--- cardinal_pythonlib/sqlalchemy/dialect.py | 3 + cardinal_pythonlib/sqlalchemy/schema.py | 87 ++++++++++++++++--- ...t_sql_string_coltype_to_sqlalchemy_type.py | 52 +++++++++++ 4 files changed, 162 insertions(+), 24 deletions(-) create mode 100644 dev_notes/convert_sql_string_coltype_to_sqlalchemy_type.py diff --git a/cardinal_pythonlib/sql/validation.py b/cardinal_pythonlib/sql/validation.py index b5f5825..1fb040d 100644 --- a/cardinal_pythonlib/sql/validation.py +++ b/cardinal_pythonlib/sql/validation.py @@ -24,8 +24,6 @@ **Functions to check table/column names etc. for validity in SQL.** -This is a slight - """ import re @@ -41,13 +39,29 @@ # ... SQL Server is very liberal! -# - ANSI: http://jakewheat.github.io/sql-overview/sql-2011-foundation-grammar.html#predefined-type # noqa: E501 +# - ANSI: +# - http://jakewheat.github.io/sql-overview/sql-2011-foundation-grammar.html#predefined-type # noqa: E501 +# # - SQL Server: # - https://support.microsoft.com/en-us/office/equivalent-ansi-sql-data-types-7a0a6bef-ef25-45f9-8a9a-3c5f21b5c65d # noqa: E501 # - https://docs.microsoft.com/en-us/sql/t-sql/data-types/data-types-transact-sql?view=sql-server-ver15 # noqa: E501 +# - https://learn.microsoft.com/en-us/sql/t-sql/data-types/data-types-transact-sql?view=sql-server-ver16 # noqa: E501 # - Note that ANSI "BIT" is SQL Server "BINARY". 
-# - MySQL: https://dev.mysql.com/doc/refman/8.0/en/data-types.html -# - PostgreSQL: https://www.postgresql.org/docs/9.5/datatype.html +# +# - MySQL: +# - https://dev.mysql.com/doc/refman/8.0/en/data-types.html +# - https://dev.mysql.com/doc/refman/9.1/en/data-types.html +# +# - PostgreSQL: +# - https://www.postgresql.org/docs/9.5/datatype.html +# +# - SQLite: +# - https://www.sqlite.org/datatype3.html +# +# - Databricks: +# - https://github.com/databricks/databricks-sqlalchemy + +SQLTYPE_DATE = "DATE" # ANSI SQLTYPES_INTEGER = ( "BIGINT", # ANSI @@ -71,6 +85,12 @@ "SMALLSERIAL", # PostgreSQL "TINYINT", # SQL Server, MySQL ) +SQLTYPES_BIT = ( + "BIT VARYING", # ANSI + "BIT", # ANSI + "BOOL", # MySQL synonym for BOOLEAN or TINYINT(1) + "BOOLEAN", # ANSI +) SQLTYPES_FLOAT = ( "DOUBLE PRECISION", # ANSI (8 bytes) "DOUBLE", # SQL Server, MySQL; synonym for DOUBLE PRECISION @@ -84,16 +104,13 @@ "SINGLE", # SQL Server ) SQLTYPES_OTHER_NUMERIC = ( - "BIT VARYING", # ANSI - "BIT", # ANSI - "BOOL", # MySQL synonym for BOOLEAN or TINYINT(1) - "BOOLEAN", # ANSI "DEC", # ANSI; synonym for DECIMAL "DECIMAL", # ANSI "FIXED", # MySQL; synonym for DECIMAL "LOGICAL", # SQL Server "LOGICAL1", # SQL Server "NUMERIC", # ANSI; synonym for DECIMAL + "SMALLMONEY", # SQL Server "ROWVERSION", # SQL Server "VARBIT", # PostgreSQL synonym for BIT VARYING "YESNO", # SQL Server @@ -125,8 +142,8 @@ "NTEXT", # SQL Server "NVARCHAR", # SQL Server "SET", # MySQL - "STRING", # SQL Server - "TEXT", # SQL Server, MySQL + "STRING", # SQL Server, Databricks + "TEXT", # SQL Server, MySQL, SQLite "TINYTEXT", # MySQL "VARCHAR", # ANSI ) @@ -146,12 +163,13 @@ "VARBINARY", # ANSI ) SQLTYPES_WITH_DATE = ( - "DATE", # ANSI - "DATETIME", # SQL Server, MySQL + SQLTYPE_DATE, # ANSI + "DATETIME", # SQL Server, MySQL, most "DATETIME2", # SQL Server "DATETIMEOFFSET", # SQL Server (date + time + time zone) "SMALLDATETIME", # SQL Server "TIMESTAMP", # ANSI + "TIMESTAMP_NTZ", # Databricks ) SQLTYPES_DATETIME_OTHER = ( 
"INTERVAL", # ANSI (not always supported); PostgreSQL diff --git a/cardinal_pythonlib/sqlalchemy/dialect.py b/cardinal_pythonlib/sqlalchemy/dialect.py index 8d0faab..76f6c0a 100644 --- a/cardinal_pythonlib/sqlalchemy/dialect.py +++ b/cardinal_pythonlib/sqlalchemy/dialect.py @@ -56,6 +56,9 @@ class SqlaDialectName(object): SYBASE = "sybase" # Additional third-party dialects: + # - https://docs.sqlalchemy.org/en/20/dialects/ + # Interface: + # - https://docs.sqlalchemy.org/en/20/core/internals.html#sqlalchemy.engine.Dialect # noqa: E501 DATABRICKS = "databricks" # ... https://github.com/databricks/databricks-sqlalchemy diff --git a/cardinal_pythonlib/sqlalchemy/schema.py b/cardinal_pythonlib/sqlalchemy/schema.py index 3e71166..77b9fb1 100644 --- a/cardinal_pythonlib/sqlalchemy/schema.py +++ b/cardinal_pythonlib/sqlalchemy/schema.py @@ -25,6 +25,9 @@ **Functions to work with SQLAlchemy schemas (schemata) directly, via SQLAlchemy Core.** +Functions that have to work with specific dialect information are marked +DIALECT-AWARE. 
+ """ import ast @@ -60,7 +63,19 @@ ) from sqlalchemy.sql import sqltypes, text from sqlalchemy.sql.ddl import DDLElement -from sqlalchemy.sql.sqltypes import BigInteger, TypeEngine +from sqlalchemy.sql.sqltypes import ( + BigInteger, + Boolean, + Date, + DateTime, + Double, + Float, + Integer, + Numeric, + SmallInteger, + Text, + TypeEngine, +) from sqlalchemy.sql.visitors import Visitable from cardinal_pythonlib.logs import get_brace_style_log_with_null_handler @@ -86,6 +101,23 @@ MSSQL_DEFAULT_SCHEMA = "dbo" POSTGRES_DEFAULT_SCHEMA = "public" +DATABRICKS_SQLCOLTYPE_TO_SQLALCHEMY_GENERIC = { + # A bit nasty: https://github.com/databricks/databricks-sqlalchemy + # Part of the reverse mapping is via + # from databricks.sqlalchemy import DatabricksDialect + # print(DatabricksDialect.colspecs) + "BIGINT": BigInteger, + "BOOLEAN": Boolean, + "DATE": Date, + "TIMESTAMP_NTZ": DateTime, + "DOUBLE": Double, + "FLOAT": Float, + "INT": Integer, + "DECIMAL": Numeric, + "SMALLINT": SmallInteger, + "STRING": Text, +} + # ============================================================================= # Inspect tables (SQLAlchemy Core) @@ -498,6 +530,8 @@ def add_index( The table name is worked out from the :class:`Column` object. + DIALECT-AWARE. + Args: engine: SQLAlchemy :class:`Engine` object sqla_column: single column to index @@ -733,6 +767,8 @@ def giant_text_sqltype(dialect: Dialect) -> str: Returns the SQL column type used to make very large text columns for a given dialect. + DIALECT-AWARE. 
+ Args: dialect: a SQLAlchemy :class:`Dialect` Returns: @@ -755,6 +791,9 @@ def giant_text_sqltype(dialect: Dialect) -> str: elif dname == SqlaDialectName.SQLITE: return "TEXT" # https://www.sqlite.org/datatype3.html + elif dname == SqlaDialectName.DATABRICKS: + return "STRING" + # https://github.com/databricks/databricks-sqlalchemy else: raise ValueError(f"Unknown dialect: {dname}") @@ -787,16 +826,40 @@ def _get_sqla_coltype_class_from_str( Returns the SQLAlchemy class corresponding to a particular SQL column type in a given dialect. + DIALECT-AWARE. + Performs an upper- and lower-case search. For example, the SQLite dialect uses upper case, and the MySQL dialect uses lower case. + + For exploratory thinking, see + dev_notes/convert_sql_string_coltype_to_sqlalchemy_type.py. + + DISCUSSION AT: https://github.com/sqlalchemy/sqlalchemy/discussions/12230 """ - # noinspection PyUnresolvedReferences - ischema_names = dialect.ischema_names - try: - return ischema_names[coltype.upper()] - except KeyError: - return ischema_names[coltype.lower()] + if hasattr(dialect, "ischema_names"): + # The built-in dialects all have this, even though it's an internal + # detail. + ischema_names = dialect.ischema_names + try: + return ischema_names[coltype.upper()] + except KeyError: + return ischema_names[coltype.lower()] + elif dialect.name == SqlaDialectName.DATABRICKS: + # Ugly hack. + # Databricks is an example that doesn't have ischema_names. + try: + return DATABRICKS_SQLCOLTYPE_TO_SQLALCHEMY_GENERIC[coltype.upper()] + except KeyError: + raise ValueError( + f"Don't know how to convert SQL column type {coltype!r} " + f"to SQLAlchemy dialect {dialect!r}" + ) + else: + raise ValueError( + f"Don't know a generic way to convert SQL column types " + f"(in text format) to SQLAlchemy dialect {dialect.name!r}. 
" + ) def get_list_of_sql_string_literals_from_quoted_csv(x: str) -> List[str]: @@ -830,6 +893,8 @@ def get_sqla_coltype_from_dialect_str( ``coltype.compile()`` or ``coltype.compile(dialect)``; see :class:`TypeEngine`. + DIALECT-AWARE. + Args: dialect: a SQLAlchemy :class:`Dialect` class @@ -999,6 +1064,8 @@ def convert_sqla_type_for_dialect( """ Converts an SQLAlchemy column type from one SQL dialect to another. + DIALECT-AWARE. + Args: coltype: SQLAlchemy column type in the source dialect @@ -1024,9 +1091,7 @@ def convert_sqla_type_for_dialect( """ assert coltype is not None - # noinspection PyUnresolvedReferences to_mysql = dialect.name == SqlaDialectName.MYSQL - # noinspection PyUnresolvedReferences to_mssql = dialect.name == SqlaDialectName.MSSQL typeclass = type(coltype) @@ -1201,10 +1266,10 @@ def does_sqlatype_require_index_len( # ============================================================================= -# hack_in_mssql_xml_type: +# hack_in_mssql_xml_type +# ============================================================================= # # Removed, as mssql.base.ischema_names["xml"] is now defined. -# ============================================================================= # ============================================================================= diff --git a/dev_notes/convert_sql_string_coltype_to_sqlalchemy_type.py b/dev_notes/convert_sql_string_coltype_to_sqlalchemy_type.py new file mode 100644 index 0000000..c851375 --- /dev/null +++ b/dev_notes/convert_sql_string_coltype_to_sqlalchemy_type.py @@ -0,0 +1,52 @@ +# EXPLORATORY CODE ONLY. +# +# PROBLEM: Take a SQL string fragment representing a column type (e.g. +# "VARCHAR(32)", "STRING") and an SQLAlchemy dialect (a core one like mysql or +# sqlite, or a third-party one like databricks), and return the appropriate +# SQLAlchemy type as a TypeEngine class/instance. +# +# CURRENT IMPLEMENTATION: +# cardinal_pythonlib.sqlalchemy.schema.get_sqla_coltype_from_dialect_str() +# ... 
with its sub-function, _get_sqla_coltype_class_from_str() +# +# DISCUSSION AT: https://github.com/sqlalchemy/sqlalchemy/discussions/12230 + + +# For exploring some files directly: +from sqlalchemy.inspection import inspect # noqa: F401 +import sqlalchemy.dialects.sqlite.base # noqa: F401 +import sqlalchemy.dialects.sqlite.pysqlite # noqa: F401 + +# Test code for dialects: +from sqlalchemy.engine.default import DefaultDialect +from sqlalchemy.dialects.mssql import dialect as MSSQLDialect +from sqlalchemy.dialects.mysql import dialect as MySQLDialect +from sqlalchemy.dialects.postgresql import dialect as PostgreSQLDialect +from sqlalchemy.dialects.sqlite import dialect as SQLiteDialect + +# Third-party dialect +from databricks.sqlalchemy import DatabricksDialect + +# Create instances to explore: +default_dialect = DefaultDialect() +postgresql_dialect = PostgreSQLDialect() +mssql_dialect = MSSQLDialect() +mysql_dialect = MySQLDialect() +sqlite_dialect = SQLiteDialect() +databricks_dialect = DatabricksDialect() + +print(sqlite_dialect.ischema_names) + +# The native ones all have an "ischema_names" dictionary, apart from +# DefaultDialect. The Databricks one doesn't. + +# The way SQLAlchemy does this for real is via an Inspector, which passes on +# to the Dialect. +# Inspector: https://docs.sqlalchemy.org/en/20/core/reflection.html#sqlalchemy.engine.reflection.Inspector # noqa: E501 +# Engine: https://docs.sqlalchemy.org/en/20/core/connections.html#sqlalchemy.engine.Engine # noqa: E501 +# Dialect: https://docs.sqlalchemy.org/en/14/core/internals.html#sqlalchemy.engine.Dialect # noqa: E501 +# ... get_columns() +# ... type_descriptor(), converts generic SQLA type to dialect-specific type. +# DefaultDialect: https://docs.sqlalchemy.org/en/14/core/internals.html#sqlalchemy.engine.default.DefaultDialect # noqa: E501 + +# I can't find a generic method. See discussion above: there isn't one.