Skip to content

Commit

Permalink
Add Databricks support
Browse files Browse the repository at this point in the history
  • Loading branch information
RudolfCardinal committed Jan 8, 2025
1 parent 18af04d commit 3796047
Show file tree
Hide file tree
Showing 4 changed files with 162 additions and 24 deletions.
44 changes: 31 additions & 13 deletions cardinal_pythonlib/sql/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@
**Functions to check table/column names etc. for validity in SQL.**
This is a slight
"""

import re
Expand All @@ -41,13 +39,29 @@
# ... SQL Server is very liberal!


# - ANSI: http://jakewheat.github.io/sql-overview/sql-2011-foundation-grammar.html#predefined-type # noqa: E501
# - ANSI:
# - http://jakewheat.github.io/sql-overview/sql-2011-foundation-grammar.html#predefined-type # noqa: E501
#
# - SQL Server:
# - https://support.microsoft.com/en-us/office/equivalent-ansi-sql-data-types-7a0a6bef-ef25-45f9-8a9a-3c5f21b5c65d # noqa: E501
# - https://docs.microsoft.com/en-us/sql/t-sql/data-types/data-types-transact-sql?view=sql-server-ver15 # noqa: E501
# - https://learn.microsoft.com/en-us/sql/t-sql/data-types/data-types-transact-sql?view=sql-server-ver16 # noqa: E501
# - Note that ANSI "BIT" is SQL Server "BINARY".
# - MySQL: https://dev.mysql.com/doc/refman/8.0/en/data-types.html
# - PostgreSQL: https://www.postgresql.org/docs/9.5/datatype.html
#
# - MySQL:
# - https://dev.mysql.com/doc/refman/8.0/en/data-types.html
# - https://dev.mysql.com/doc/refman/9.1/en/data-types.html
#
# - PostgreSQL:
# - https://www.postgresql.org/docs/9.5/datatype.html
#
# - SQLite:
# - https://www.sqlite.org/datatype3.html
#
# - Databricks:
# - https://github.com/databricks/databricks-sqlalchemy

SQLTYPE_DATE = "DATE" # ANSI

SQLTYPES_INTEGER = (
"BIGINT", # ANSI
Expand All @@ -71,6 +85,12 @@
"SMALLSERIAL", # PostgreSQL
"TINYINT", # SQL Server, MySQL
)
# SQL column types that hold bit or Boolean values. The trailing comment on
# each entry names the standard or dialect that the type name comes from.
SQLTYPES_BIT = (
"BIT VARYING", # ANSI
"BIT", # ANSI
"BOOL", # MySQL synonym for BOOLEAN or TINYINT(1)
"BOOLEAN", # ANSI
)
SQLTYPES_FLOAT = (
"DOUBLE PRECISION", # ANSI (8 bytes)
"DOUBLE", # SQL Server, MySQL; synonym for DOUBLE PRECISION
Expand All @@ -84,16 +104,13 @@
"SINGLE", # SQL Server
)
SQLTYPES_OTHER_NUMERIC = (
"BIT VARYING", # ANSI
"BIT", # ANSI
"BOOL", # MySQL synonym for BOOLEAN or TINYINT(1)
"BOOLEAN", # ANSI
"DEC", # ANSI; synonym for DECIMAL
"DECIMAL", # ANSI
"FIXED", # MySQL; synonym for DECIMAL
"LOGICAL", # SQL Server
"LOGICAL1", # SQL Server
"NUMERIC", # ANSI; synonym for DECIMAL
"SMALLMONEY", # SQL Server
"ROWVERSION", # SQL Server
"VARBIT", # PostgreSQL synonym for BIT VARYING
"YESNO", # SQL Server
Expand Down Expand Up @@ -125,8 +142,8 @@
"NTEXT", # SQL Server
"NVARCHAR", # SQL Server
"SET", # MySQL
"STRING", # SQL Server
"TEXT", # SQL Server, MySQL
"STRING", # SQL Server, Databricks
"TEXT", # SQL Server, MySQL, SQLite
"TINYTEXT", # MySQL
"VARCHAR", # ANSI
)
Expand All @@ -146,12 +163,13 @@
"VARBINARY", # ANSI
)
# SQL column types whose values include a date component (either a pure date
# or a combined date and time). The trailing comment on each entry names the
# standard or dialect that the type name comes from.
SQLTYPES_WITH_DATE = (
    SQLTYPE_DATE,  # ANSI
    "DATETIME",  # SQL Server, MySQL, most
    "DATETIME2",  # SQL Server
    "DATETIMEOFFSET",  # SQL Server (date + time + time zone)
    "SMALLDATETIME",  # SQL Server
    "TIMESTAMP",  # ANSI
    "TIMESTAMP_NTZ",  # Databricks
)
SQLTYPES_DATETIME_OTHER = (
"INTERVAL", # ANSI (not always supported); PostgreSQL
Expand Down
3 changes: 3 additions & 0 deletions cardinal_pythonlib/sqlalchemy/dialect.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@ class SqlaDialectName(object):
SYBASE = "sybase"

# Additional third-party dialects:
# - https://docs.sqlalchemy.org/en/20/dialects/
# Interface:
# - https://docs.sqlalchemy.org/en/20/core/internals.html#sqlalchemy.engine.Dialect # noqa: E501

DATABRICKS = "databricks"
# ... https://github.com/databricks/databricks-sqlalchemy
Expand Down
87 changes: 76 additions & 11 deletions cardinal_pythonlib/sqlalchemy/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@
**Functions to work with SQLAlchemy schemas (schemata) directly, via SQLAlchemy
Core.**
Functions that have to work with specific dialect information are marked
DIALECT-AWARE.
"""

import ast
Expand Down Expand Up @@ -60,7 +63,19 @@
)
from sqlalchemy.sql import sqltypes, text
from sqlalchemy.sql.ddl import DDLElement
from sqlalchemy.sql.sqltypes import BigInteger, TypeEngine
from sqlalchemy.sql.sqltypes import (
BigInteger,
Boolean,
Date,
DateTime,
Double,
Float,
Integer,
Numeric,
SmallInteger,
Text,
TypeEngine,
)
from sqlalchemy.sql.visitors import Visitable

from cardinal_pythonlib.logs import get_brace_style_log_with_null_handler
Expand All @@ -86,6 +101,23 @@
MSSQL_DEFAULT_SCHEMA = "dbo"
POSTGRES_DEFAULT_SCHEMA = "public"

# Maps Databricks SQL column type names to generic SQLAlchemy type classes.
# Used by _get_sqla_coltype_class_from_str() as a fallback for the Databricks
# dialect, which has no "ischema_names" dictionary. Keys must be upper case,
# because that function looks them up via coltype.upper().
DATABRICKS_SQLCOLTYPE_TO_SQLALCHEMY_GENERIC = {
# A bit nasty: https://github.com/databricks/databricks-sqlalchemy
# Part of the reverse mapping is via
# from databricks.sqlalchemy import DatabricksDialect
# print(DatabricksDialect.colspecs)
"BIGINT": BigInteger,
"BOOLEAN": Boolean,
"DATE": Date,
"TIMESTAMP_NTZ": DateTime,
"DOUBLE": Double,
"FLOAT": Float,
"INT": Integer,
"DECIMAL": Numeric,
"SMALLINT": SmallInteger,
"STRING": Text,
}


# =============================================================================
# Inspect tables (SQLAlchemy Core)
Expand Down Expand Up @@ -498,6 +530,8 @@ def add_index(
The table name is worked out from the :class:`Column` object.
DIALECT-AWARE.
Args:
engine: SQLAlchemy :class:`Engine` object
sqla_column: single column to index
Expand Down Expand Up @@ -733,6 +767,8 @@ def giant_text_sqltype(dialect: Dialect) -> str:
Returns the SQL column type used to make very large text columns for a
given dialect.
DIALECT-AWARE.
Args:
dialect: a SQLAlchemy :class:`Dialect`
Returns:
Expand All @@ -755,6 +791,9 @@ def giant_text_sqltype(dialect: Dialect) -> str:
elif dname == SqlaDialectName.SQLITE:
return "TEXT"
# https://www.sqlite.org/datatype3.html
elif dname == SqlaDialectName.DATABRICKS:
return "STRING"
# https://github.com/databricks/databricks-sqlalchemy
else:
raise ValueError(f"Unknown dialect: {dname}")

Expand Down Expand Up @@ -787,16 +826,40 @@ def _get_sqla_coltype_class_from_str(
Returns the SQLAlchemy class corresponding to a particular SQL column
type in a given dialect.
DIALECT-AWARE.
Performs an upper- and lower-case search.
For example, the SQLite dialect uses upper case, and the
MySQL dialect uses lower case.
For exploratory thinking, see
dev_notes/convert_sql_string_coltype_to_sqlalchemy_type.py.
DISCUSSION AT: https://github.com/sqlalchemy/sqlalchemy/discussions/12230
"""
# noinspection PyUnresolvedReferences
ischema_names = dialect.ischema_names
try:
return ischema_names[coltype.upper()]
except KeyError:
return ischema_names[coltype.lower()]
if hasattr(dialect, "ischema_names"):
# The built-in dialects all have this, even though it's an internal
# detail.
ischema_names = dialect.ischema_names
try:
return ischema_names[coltype.upper()]
except KeyError:
return ischema_names[coltype.lower()]
elif dialect.name == SqlaDialectName.DATABRICKS:
# Ugly hack.
# Databricks is an example that doesn't have ischema_names.
try:
return DATABRICKS_SQLCOLTYPE_TO_SQLALCHEMY_GENERIC[coltype.upper()]
except KeyError:
raise ValueError(
f"Don't know how to convert SQL column type {coltype!r} "
f"to SQLAlchemy dialect {dialect!r}"
)
else:
raise ValueError(
f"Don't know a generic way to convert SQL column types "
f"(in text format) to SQLAlchemy dialect {dialect.name!r}. "
)


def get_list_of_sql_string_literals_from_quoted_csv(x: str) -> List[str]:
Expand Down Expand Up @@ -830,6 +893,8 @@ def get_sqla_coltype_from_dialect_str(
``coltype.compile()`` or ``coltype.compile(dialect)``; see
:class:`TypeEngine`.
DIALECT-AWARE.
Args:
dialect: a SQLAlchemy :class:`Dialect` class
Expand Down Expand Up @@ -999,6 +1064,8 @@ def convert_sqla_type_for_dialect(
"""
Converts an SQLAlchemy column type from one SQL dialect to another.
DIALECT-AWARE.
Args:
coltype: SQLAlchemy column type in the source dialect
Expand All @@ -1024,9 +1091,7 @@ def convert_sqla_type_for_dialect(
"""
assert coltype is not None

# noinspection PyUnresolvedReferences
to_mysql = dialect.name == SqlaDialectName.MYSQL
# noinspection PyUnresolvedReferences
to_mssql = dialect.name == SqlaDialectName.MSSQL
typeclass = type(coltype)

Expand Down Expand Up @@ -1201,10 +1266,10 @@ def does_sqlatype_require_index_len(


# =============================================================================
# hack_in_mssql_xml_type:
# hack_in_mssql_xml_type
# =============================================================================
#
# Removed, as mssql.base.ischema_names["xml"] is now defined.
# =============================================================================


# =============================================================================
Expand Down
52 changes: 52 additions & 0 deletions dev_notes/convert_sql_string_coltype_to_sqlalchemy_type.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# EXPLORATORY CODE ONLY.
#
# PROBLEM: Take a SQL string fragment representing a column type (e.g.
# "VARCHAR(32)", "STRING") and an SQLAlchemy dialect (a core one like mysql or
# sqlite, or a third-party one like databricks), and return the appropriate
# SQLAlchemy type as a TypeEngine class/instance.
#
# CURRENT IMPLEMENTATION:
# cardinal_pythonlib.sqlalchemy.schema.get_sqla_coltype_from_dialect_str()
# ... with its sub-function, _get_sqla_coltype_class_from_str()
#
# DISCUSSION AT: https://github.com/sqlalchemy/sqlalchemy/discussions/12230


# For exploring some files directly:
from sqlalchemy.inspection import inspect # noqa: F401
import sqlalchemy.dialects.sqlite.base # noqa: F401
import sqlalchemy.dialects.sqlite.pysqlite # noqa: F401

# Test code for dialects:
from sqlalchemy.engine.default import DefaultDialect
from sqlalchemy.dialects.mssql import dialect as MSSQLDialect
from sqlalchemy.dialects.mysql import dialect as MySQLDialect
from sqlalchemy.dialects.postgresql import dialect as PostgreSQLDialect
from sqlalchemy.dialects.sqlite import dialect as SQLiteDialect

# Third-party dialect
from databricks.sqlalchemy import DatabricksDialect

# Create instances to explore:
# (one instance of each dialect class imported above, built with no
# constructor arguments, so that their attributes can be inspected)
default_dialect = DefaultDialect()
postgresql_dialect = PostgreSQLDialect()
mssql_dialect = MSSQLDialect()
mysql_dialect = MySQLDialect()
sqlite_dialect = SQLiteDialect()
databricks_dialect = DatabricksDialect()

# Show the SQLite dialect's "ischema_names" dictionary. This is the attribute
# that cardinal_pythonlib.sqlalchemy.schema._get_sqla_coltype_class_from_str()
# indexes by column type name to obtain a column type class.
print(sqlite_dialect.ischema_names)

# The native ones all have an "ischema_names" dictionary, apart from
# DefaultDialect. The Databricks one doesn't.

# The way SQLAlchemy does this for real is via an Inspector, which passes on
# to the Dialect.
# Inspector: https://docs.sqlalchemy.org/en/20/core/reflection.html#sqlalchemy.engine.reflection.Inspector # noqa: E501
# Engine: https://docs.sqlalchemy.org/en/20/core/connections.html#sqlalchemy.engine.Engine # noqa: E501
# Dialect: https://docs.sqlalchemy.org/en/14/core/internals.html#sqlalchemy.engine.Dialect # noqa: E501
# ... get_columns()
# ... type_descriptor(), converts generic SQLA type to dialect-specific type.
# DefaultDialect: https://docs.sqlalchemy.org/en/14/core/internals.html#sqlalchemy.engine.default.DefaultDialect # noqa: E501

# I can't find a generic method. See discussion above: there isn't one.

0 comments on commit 3796047

Please sign in to comment.