From b5140691abfa29efe5412887b3597456b1700dbc Mon Sep 17 00:00:00 2001
From: Ian Rose
Date: Tue, 10 Sep 2019 11:24:07 -0700
Subject: [PATCH 1/4] Allow postgres client to read tables with UUID, JSON, JSONB types.

---
 ibis/expr/datatypes.py |  3 +++
 ibis/sql/alchemy.py    | 15 +++++++++++++++
 2 files changed, 18 insertions(+)

diff --git a/ibis/expr/datatypes.py b/ibis/expr/datatypes.py
index 599a04d6d539..301d2c496d1d 100644
--- a/ibis/expr/datatypes.py
+++ b/ibis/expr/datatypes.py
@@ -129,6 +129,9 @@ def _literal_value_hash_key(self, value) -> int:
 
 
 class Any(DataType):
+    scalar = ir.AnyScalar
+    column = ir.AnyColumn
+
     __slots__ = ()
 
 
diff --git a/ibis/sql/alchemy.py b/ibis/sql/alchemy.py
index 8d4621cd5011..62102e5537b5 100644
--- a/ibis/sql/alchemy.py
+++ b/ibis/sql/alchemy.py
@@ -136,6 +136,21 @@ def sa_double(_, satype, nullable=True):
     return dt.Double(nullable=nullable)
 
 
+@dt.dtype.register(PostgreSQLDialect, sa.dialects.postgresql.UUID)
+def sa_uuid(_, satype, nullable=True):
+    return dt.Any(nullable=nullable)
+
+
+@dt.dtype.register(PostgreSQLDialect, sa.dialects.postgresql.JSON)
+def sa_json(_, satype, nullable=True):
+    return dt.Any(nullable=nullable)
+
+
+@dt.dtype.register(PostgreSQLDialect, sa.dialects.postgresql.JSONB)
+def sa_jsonb(_, satype, nullable=True):
+    return dt.Any(nullable=nullable)
+
+
 if geospatial_supported:
 
     @dt.dtype.register(SQLAlchemyDialect, (ga.Geometry, ga.types._GISType))

From 3748ef959cb9c5ad851674cb0c37cbec55cb8831 Mon Sep 17 00:00:00 2001
From: Ian Rose
Date: Tue, 10 Sep 2019 11:54:58 -0700
Subject: [PATCH 2/4] Add some light type testing for json, jsonb, uuid.

---
 ibis/sql/tests/test_sqlalchemy.py | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/ibis/sql/tests/test_sqlalchemy.py b/ibis/sql/tests/test_sqlalchemy.py
index 2b18d1e2497e..05daf5d3217c 100644
--- a/ibis/sql/tests/test_sqlalchemy.py
+++ b/ibis/sql/tests/test_sqlalchemy.py
@@ -104,6 +104,32 @@ def test_sqla_schema_conversion(self):
 
         assert_equal(schema, expected)
 
+    def test_sqla_postgres_schema_conversion(self):
+        typespec = [
+            # name, type, nullable
+            ('json', sa.dialects.postgresql.JSON, True, dt.any),
+            ('jsonb', sa.dialects.postgresql.JSONB, True, dt.any),
+            ('uuid', sa.dialects.postgresql.UUID, True, dt.any),
+        ]
+
+        sqla_types = []
+        ibis_types = []
+        for name, t, nullable, ibis_type in typespec:
+            sqla_type = sa.Column(name, t, nullable=nullable)
+            sqla_types.append(sqla_type)
+            ibis_types.append((name, ibis_type(nullable=nullable)))
+
+        # Create a table with placeholder stubs for JSON, JSONB, and UUID.
+        engine = sa.create_engine('postgresql://')
+        table = sa.Table('tname', sa.MetaData(bind=engine), *sqla_types)
+
+        # Check that we can correctly create a schema with dt.any for the
+        # missing types.
+        schema = alch.schema_from_table(table)
+        expected = ibis.schema(ibis_types)
+
+        assert_equal(schema, expected)
+
     @pytest.mark.xfail(raises=AssertionError, reason='NYT')
     def test_ibis_to_sqla_conversion(self):
         assert False
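For illustration, a minimal sketch of what the first two patches enable, mirroring the new test above. The table and column names here are hypothetical; no live PostgreSQL server is needed, since `sa.create_engine` does not connect eagerly:

    import sqlalchemy as sa

    import ibis.sql.alchemy as alch

    engine = sa.create_engine('postgresql://')
    table = sa.Table(
        'events',  # hypothetical table name
        sa.MetaData(bind=engine),
        sa.Column('id', sa.dialects.postgresql.UUID),
        sa.Column('payload', sa.dialects.postgresql.JSON),
        sa.Column('payload_b', sa.dialects.postgresql.JSONB),
    )

    # Before PATCH 1/4 these column types had no registered mapping and
    # schema reflection failed; after it, all three reflect as dt.any.
    schema = alch.schema_from_table(table)
    print(schema)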
From 79bab943d392bf45590ffe29c745bc0dcf9d146f Mon Sep 17 00:00:00 2001
From: Ian Rose
Date: Fri, 27 Sep 2019 08:27:25 -0700
Subject: [PATCH 3/4] Move test to postgres client test suite.

---
 ibis/sql/postgres/tests/test_client.py | 30 +++++++++++++++++++++++++-
 ibis/sql/tests/test_sqlalchemy.py      | 26 ----------------------
 2 files changed, 29 insertions(+), 27 deletions(-)

diff --git a/ibis/sql/postgres/tests/test_client.py b/ibis/sql/postgres/tests/test_client.py
index afb1a2248ac9..b38c86163b7d 100644
--- a/ibis/sql/postgres/tests/test_client.py
+++ b/ibis/sql/postgres/tests/test_client.py
@@ -21,9 +21,10 @@
 import ibis
 import ibis.expr.datatypes as dt
 import ibis.expr.types as ir
+import ibis.sql.alchemy as alch  # noqa: E402
 from ibis.tests.util import assert_equal
 
-pytest.importorskip('sqlalchemy')
+sa = pytest.importorskip('sqlalchemy')
 pytest.importorskip('psycopg2')
 
 pytestmark = pytest.mark.postgresql
@@ -136,6 +137,33 @@ def test_schema_table():
     assert isinstance(schema['tables'], ir.TableExpr)
 
 
+def test_schema_unsupported_type_conversion():
+    typespec = [
+        # name, type, nullable
+        ('json', sa.dialects.postgresql.JSON, True, dt.any),
+        ('jsonb', sa.dialects.postgresql.JSONB, True, dt.any),
+        ('uuid', sa.dialects.postgresql.UUID, True, dt.any),
+    ]
+
+    sqla_types = []
+    ibis_types = []
+    for name, t, nullable, ibis_type in typespec:
+        sqla_type = sa.Column(name, t, nullable=nullable)
+        sqla_types.append(sqla_type)
+        ibis_types.append((name, ibis_type(nullable=nullable)))
+
+    # Create a table with placeholder stubs for JSON, JSONB, and UUID.
+    engine = sa.create_engine('postgresql://')
+    table = sa.Table('tname', sa.MetaData(bind=engine), *sqla_types)
+
+    # Check that we can correctly create a schema with dt.any for the
+    # missing types.
+    schema = alch.schema_from_table(table)
+    expected = ibis.schema(ibis_types)
+
+    assert_equal(schema, expected)
+
+
 def test_interval_films_schema(con):
     t = con.table("films")
     assert t.len.type() == dt.Interval(unit="m")
diff --git a/ibis/sql/tests/test_sqlalchemy.py b/ibis/sql/tests/test_sqlalchemy.py
index 05daf5d3217c..2b18d1e2497e 100644
--- a/ibis/sql/tests/test_sqlalchemy.py
+++ b/ibis/sql/tests/test_sqlalchemy.py
@@ -104,32 +104,6 @@ def test_sqla_schema_conversion(self):
 
         assert_equal(schema, expected)
 
-    def test_sqla_postgres_schema_conversion(self):
-        typespec = [
-            # name, type, nullable
-            ('json', sa.dialects.postgresql.JSON, True, dt.any),
-            ('jsonb', sa.dialects.postgresql.JSONB, True, dt.any),
-            ('uuid', sa.dialects.postgresql.UUID, True, dt.any),
-        ]
-
-        sqla_types = []
-        ibis_types = []
-        for name, t, nullable, ibis_type in typespec:
-            sqla_type = sa.Column(name, t, nullable=nullable)
-            sqla_types.append(sqla_type)
-            ibis_types.append((name, ibis_type(nullable=nullable)))
-
-        # Create a table with placeholder stubs for JSON, JSONB, and UUID.
-        engine = sa.create_engine('postgresql://')
-        table = sa.Table('tname', sa.MetaData(bind=engine), *sqla_types)
-
-        # Check that we can correctly create a schema with dt.any for the
-        # missing types.
-        schema = alch.schema_from_table(table)
-        expected = ibis.schema(ibis_types)
-
-        assert_equal(schema, expected)
-
     @pytest.mark.xfail(raises=AssertionError, reason='NYT')
     def test_ibis_to_sqla_conversion(self):
         assert False
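A short aside on the importorskip pattern this patch leans on: binding the return value gives the moved test the module handle it needs for `sa.Column` and friends, while the whole file still skips cleanly when a dependency is absent. A sketch, not part of the patch:

    import pytest

    # importorskip returns the imported module, or skips the test module
    # entirely if the import fails.
    sa = pytest.importorskip('sqlalchemy')
    pytest.importorskip('psycopg2')  # the driver only needs to be importable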
From 298efc2600b03029ebf8d5d50b600654cfd2e5c2 Mon Sep 17 00:00:00 2001
From: Ivan Ogasawara
Date: Fri, 27 Sep 2019 20:46:37 -0400
Subject: [PATCH 4/4] Added JSON, JSONB, and UUID data types.

---
 ibis/expr/datatypes.py                 | 89 ++++++++++++++++++++++++--
 ibis/expr/types.py                     | 36 +++++++++++
 ibis/sql/alchemy.py                    |  6 +-
 ibis/sql/postgres/tests/test_client.py |  8 +--
 ibis/tests/all/test_json.py            | 31 +++++++++
 ibis/tests/all/test_string.py          | 14 +++-
 6 files changed, 171 insertions(+), 13 deletions(-)
 create mode 100644 ibis/tests/all/test_json.py

diff --git a/ibis/expr/datatypes.py b/ibis/expr/datatypes.py
index 301d2c496d1d..aa39bc30150c 100644
--- a/ibis/expr/datatypes.py
+++ b/ibis/expr/datatypes.py
@@ -129,9 +129,6 @@ def _literal_value_hash_key(self, value) -> int:
 
 
 class Any(DataType):
-    scalar = ir.AnyScalar
-    column = ir.AnyColumn
-
     __slots__ = ()
 
 
@@ -639,6 +636,23 @@ def _literal_value_hash_key(self, value):
         return self, _tuplize(value.items())
 
 
+class JSON(String):
+    """JSON (JavaScript Object Notation) text format."""
+
+    scalar = ir.JSONScalar
+    column = ir.JSONColumn
+
+
+class JSONB(Binary):
+    """JSON (JavaScript Object Notation) data stored as a binary
+    representation, which eliminates whitespace, duplicate keys,
+    and key ordering.
+    """
+
+    scalar = ir.JSONBScalar
+    column = ir.JSONBColumn
+
+
 class GeoSpatial(DataType):
     __slots__ = 'geotype', 'srid'
@@ -782,6 +796,17 @@ class MultiPolygon(GeoSpatial):
     __slots__ = ()
 
 
+class UUID(String):
+    """A universally unique identifier (UUID) is a 128-bit number used to
+    identify information in computer systems.
+    """
+
+    scalar = ir.UUIDScalar
+    column = ir.UUIDColumn
+
+    __slots__ = ()
+
+
 # ---------------------------------------------------------------------
 any = Any()
 null = Null()
@@ -818,7 +843,11 @@ class MultiPolygon(GeoSpatial):
 multilinestring = MultiLineString()
 multipoint = MultiPoint()
 multipolygon = MultiPolygon()
-
+# json
+json = JSON()
+jsonb = JSONB()
+# special string based data type
+uuid = UUID()
 
 _primitive_types = [
     ('any', any),
@@ -884,6 +913,9 @@ class Tokens:
     MULTIPOINT = 28
     MULTIPOLYGON = 29
     SEMICOLON = 30
+    JSON = 31
+    JSONB = 32
+    UUID = 33
 
     @staticmethod
     def name(value):
@@ -894,7 +926,6 @@ def name(value):
     (getattr(Tokens, n), n)
     for n in dir(Tokens)
     if n.isalpha() and n.isupper()
 )
-
 
 Token = collections.namedtuple('Token', ('type', 'value'))
@@ -1008,6 +1039,22 @@ def name(value):
         ),
     )
 ]
+    + [
+        # json data type
+        (
+            '(?P<{}>{})'.format(token.upper(), token),
+            lambda token, toktype=toktype: Token(toktype, token),
+        )
+        for token, toktype in zip(
+            # note: `jsonb` should be first to avoid conflict with `json`
+            ('jsonb', 'json'),
+            (Tokens.JSONB, Tokens.JSON),
+        )
+    ]
+    + [
+        # special string based data types
+        ('(?P<UUID>uuid)', lambda token: Token(Tokens.UUID, token))
+    ]
     + [
         # integers, for decimal spec
        (r'(?P<INTEGER>\d+)', lambda token: Token(Tokens.INTEGER, int(token))),
@@ -1212,6 +1259,12 @@ def type(self) -> DataType:
                  | "multipolygon" ":" geotype
                  | "multipolygon" ";" srid ":" geotype
 
+        json : "json"
+
+        jsonb : "jsonb"
+
+        uuid : "uuid"
+
         """
         if self._accept(Tokens.PRIMITIVE):
             assert self.tok is not None
@@ -1325,6 +1378,13 @@ def type(self) -> DataType:
             self._expect(Tokens.RBRACKET)
             return Struct(names, types)
 
+        # json data types
+        elif self._accept(Tokens.JSON):
+            return JSON()
+
+        elif self._accept(Tokens.JSONB):
+            return JSONB()
+
         # geo spatial data type
         elif self._accept(Tokens.GEOMETRY):
             return Geometry()
@@ -1434,6 +1494,10 @@ def type(self) -> DataType:
 
             return MultiPolygon(geotype=geotype, srid=srid)
 
+        # special string based data types
+        elif self._accept(Tokens.UUID):
+            return UUID()
+
         else:
             raise SyntaxError('Type cannot be parsed: {}'.format(self.text))
@@ -1766,6 +1830,16 @@ def can_cast_variadic(
     return castable(source.value_type, target.value_type)
 
 
+@castable.register(JSON, JSON)
+def can_cast_json(source, target, **kwargs):
+    return True
+
+
+@castable.register(JSONB, JSONB)
+def can_cast_jsonb(source, target, **kwargs):
+    return True
+
+
 # geo spatial data type
 # cast between same type, used to cast from/to geometry and geography
 GEO_TYPES = (
@@ -1785,6 +1859,11 @@ def can_cast_geospatial(source, target, **kwargs):
     return True
 
 
+@castable.register(UUID, UUID)
+def can_cast_special_string(source, target, **kwargs):
+    return True
+
+
 # @castable.register(Map, Map)
 # def can_cast_maps(source, target):
 #     return (source.equals(target) or
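As a quick illustration of the castable rules just registered above (a sketch assuming this patch is applied; `castable` is the same predicate the diff registers implementations on):

    import ibis.expr.datatypes as dt

    # Same-type casts are allowed for each of the new types.
    assert dt.castable(dt.json, dt.json)
    assert dt.castable(dt.jsonb, dt.jsonb)
    assert dt.castable(dt.uuid, dt.uuid)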
diff --git a/ibis/expr/types.py b/ibis/expr/types.py
index 6262c2355c4c..1abcae398843 100644
--- a/ibis/expr/types.py
+++ b/ibis/expr/types.py
@@ -794,6 +794,30 @@ class MapColumn(AnyColumn, MapValue):
     pass  # noqa: E701,E302
 
 
+class JSONValue(StringValue):
+    pass  # noqa: E701,E302
+
+
+class JSONScalar(StringScalar, JSONValue):
+    pass  # noqa: E701,E302
+
+
+class JSONColumn(StringColumn, JSONValue):
+    pass  # noqa: E701,E302
+
+
+class JSONBValue(BinaryValue):
+    pass  # noqa: E701,E302
+
+
+class JSONBScalar(BinaryScalar, JSONBValue):
+    pass  # noqa: E701,E302
+
+
+class JSONBColumn(BinaryColumn, JSONBValue):
+    pass  # noqa: E701,E302
+
+
 class StructValue(AnyValue):
     def __dir__(self):
         return sorted(
@@ -909,6 +933,18 @@ class MultiPolygonColumn(GeoSpatialColumn, MultiPolygonValue):  # noqa: E302
     pass  # noqa: E701
 
 
+class UUIDValue(StringValue):
+    pass  # noqa: E701,E302
+
+
+class UUIDScalar(StringScalar, UUIDValue):
+    pass  # noqa: E701,E302
+
+
+class UUIDColumn(StringColumn, UUIDValue):
+    pass  # noqa: E701,E302
+
+
 class ListExpr(ColumnExpr, AnyValue):
     @property
     def values(self):
diff --git a/ibis/sql/alchemy.py b/ibis/sql/alchemy.py
index 62102e5537b5..cf81263248c9 100644
--- a/ibis/sql/alchemy.py
+++ b/ibis/sql/alchemy.py
@@ -138,17 +138,17 @@ def sa_double(_, satype, nullable=True):
 
 @dt.dtype.register(PostgreSQLDialect, sa.dialects.postgresql.UUID)
 def sa_uuid(_, satype, nullable=True):
-    return dt.Any(nullable=nullable)
+    return dt.UUID(nullable=nullable)
 
 
 @dt.dtype.register(PostgreSQLDialect, sa.dialects.postgresql.JSON)
 def sa_json(_, satype, nullable=True):
-    return dt.Any(nullable=nullable)
+    return dt.JSON(nullable=nullable)
 
 
 @dt.dtype.register(PostgreSQLDialect, sa.dialects.postgresql.JSONB)
 def sa_jsonb(_, satype, nullable=True):
-    return dt.Any(nullable=nullable)
+    return dt.JSONB(nullable=nullable)
 
 
 if geospatial_supported:
diff --git a/ibis/sql/postgres/tests/test_client.py b/ibis/sql/postgres/tests/test_client.py
index b38c86163b7d..a619f1116944 100644
--- a/ibis/sql/postgres/tests/test_client.py
+++ b/ibis/sql/postgres/tests/test_client.py
@@ -137,12 +137,12 @@ def test_schema_table():
     assert isinstance(schema['tables'], ir.TableExpr)
 
 
-def test_schema_unsupported_type_conversion():
+def test_schema_type_conversion():
     typespec = [
         # name, type, nullable
-        ('json', sa.dialects.postgresql.JSON, True, dt.any),
-        ('jsonb', sa.dialects.postgresql.JSONB, True, dt.any),
-        ('uuid', sa.dialects.postgresql.UUID, True, dt.any),
+        ('json', sa.dialects.postgresql.JSON, True, dt.JSON),
+        ('jsonb', sa.dialects.postgresql.JSONB, True, dt.JSONB),
+        ('uuid', sa.dialects.postgresql.UUID, True, dt.UUID),
     ]
 
     sqla_types = []
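One consequence of the expression-class hierarchy added above, sketched under the assumption that the patch is applied: JSON and UUID expressions inherit the string expression API, and JSONB the binary one, so existing string/binary behavior carries over.

    import ibis
    import ibis.expr.types as ir

    lit = ibis.literal('{"status": true}', type='json')
    assert isinstance(lit, ir.JSONScalar)
    assert isinstance(lit, ir.StringScalar)  # JSONScalar subclasses StringScalar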
diff --git a/ibis/tests/all/test_json.py b/ibis/tests/all/test_json.py
new file mode 100644
index 000000000000..bf3cd3d4848d
--- /dev/null
+++ b/ibis/tests/all/test_json.py
@@ -0,0 +1,31 @@
+"""Tests for json data types."""
+import json
+
+import pytest
+from pytest import param
+
+import ibis
+from ibis.tests.backends import PostgreSQL
+
+# add here backends that support json types
+all_db_json_supported = [PostgreSQL]
+
+
+@pytest.mark.parametrize('data', [param({'status': True}, id='status')])
+@pytest.mark.only_on_backends(all_db_json_supported)
+def test_json(backend, con, data, alltypes):
+    json_value = json.dumps(data)
+    lit = ibis.literal(json_value, type='json').name('tmp')
+    expr = alltypes[[alltypes.id, lit]].head(1)
+    df = expr.execute()
+    assert df['tmp'].iloc[0] == json_value
+
+
+@pytest.mark.parametrize('data', [param({'status': True}, id='status')])
+@pytest.mark.only_on_backends(all_db_json_supported)
+def test_jsonb(backend, con, data, alltypes):
+    jsonb_value = json.dumps(data).encode('utf8')
+    lit = ibis.literal(jsonb_value, type='jsonb').name('tmp')
+    expr = alltypes[[alltypes.id, lit]].head(1)
+    df = expr.execute()
+    assert df['tmp'].iloc[0] == jsonb_value
diff --git a/ibis/tests/all/test_string.py b/ibis/tests/all/test_string.py
index ccf73f181b14..84541426ab41 100644
--- a/ibis/tests/all/test_string.py
+++ b/ibis/tests/all/test_string.py
@@ -3,7 +3,7 @@
 import ibis
 import ibis.expr.datatypes as dt
-from ibis.tests.backends import Clickhouse, Impala, PySpark, Spark
+from ibis.tests.backends import Clickhouse, Impala, PostgreSQL, PySpark, Spark
 
 
 def test_string_col_is_unicode(backend, alltypes, df):
@@ -233,3 +233,15 @@ def test_string(backend, alltypes, df, result_func, expected_func):
     expected = backend.default_series_rename(expected_func(df))
 
     backend.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    'data, data_type',
+    [param('123e4567-e89b-12d3-a456-426655440000', 'uuid', id='uuid')],
+)
+@pytest.mark.only_on_backends([PostgreSQL])
+def test_special_strings(backend, con, alltypes, data, data_type):
+    lit = ibis.literal(data, type=data_type).name('tmp')
+    expr = alltypes[[alltypes.id, lit]].head(1)
+    df = expr.execute()
+    assert df['tmp'].iloc[0] == data
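Taken together, the series first unblocks schema reflection by mapping the three PostgreSQL types to dt.any, then upgrades them to first-class ibis types. A closing sketch of the end state, using only APIs exercised by the patch's own tests (assumes the full series is applied):

    import json

    import ibis
    import ibis.expr.datatypes as dt

    # The dtype parser now understands the new type names.
    assert dt.dtype('json') == dt.json
    assert dt.dtype('jsonb') == dt.jsonb
    assert dt.dtype('uuid') == dt.uuid

    # Literals can be typed explicitly, as the new tests do.
    payload = ibis.literal(json.dumps({'status': True}), type='json')
    key = ibis.literal('123e4567-e89b-12d3-a456-426655440000', type='uuid')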