diff --git a/ibis/expr/datatypes.py b/ibis/expr/datatypes.py index 599a04d6d539..aa39bc30150c 100644 --- a/ibis/expr/datatypes.py +++ b/ibis/expr/datatypes.py @@ -636,6 +636,23 @@ def _literal_value_hash_key(self, value): return self, _tuplize(value.items()) +class JSON(String): + """JSON (JavaScript Object Notation) text format.""" + + scalar = ir.JSONScalar + column = ir.JSONColumn + + +class JSONB(Binary): + """JSON (JavaScript Object Notation) data stored as a binary + representation, which eliminates whitespace, duplicate keys, + and key ordering. + """ + + scalar = ir.JSONBScalar + column = ir.JSONBColumn + + class GeoSpatial(DataType): __slots__ = 'geotype', 'srid' @@ -779,6 +796,17 @@ class MultiPolygon(GeoSpatial): __slots__ = () +class UUID(String): + """A universally unique identifier (UUID) is a 128-bit number used to + identify information in computer systems. + """ + + scalar = ir.UUIDScalar + column = ir.UUIDColumn + + __slots__ = () + + # --------------------------------------------------------------------- any = Any() null = Null() @@ -815,7 +843,11 @@ class MultiPolygon(GeoSpatial): multilinestring = MultiLineString() multipoint = MultiPoint() multipolygon = MultiPolygon() - +# json +json = JSON() +jsonb = JSONB() +# special string based data type +uuid = UUID() _primitive_types = [ ('any', any), @@ -881,6 +913,9 @@ class Tokens: MULTIPOINT = 28 MULTIPOLYGON = 29 SEMICOLON = 30 + JSON = 31 + JSONB = 32 + UUID = 33 @staticmethod def name(value): @@ -891,7 +926,6 @@ def name(value): (getattr(Tokens, n), n) for n in dir(Tokens) if n.isalpha() and n.isupper() ) - Token = collections.namedtuple('Token', ('type', 'value')) @@ -1005,6 +1039,22 @@ def name(value): ), ) ] + + [ + # json data type + ( + '(?P<{}>{})'.format(token.upper(), token), + lambda token, toktype=toktype: Token(toktype, token), + ) + for token, toktype in zip( + # note: `jsonb` should be first to avoid conflict with `json` + ('jsonb', 'json'), + (Tokens.JSONB, Tokens.JSON), + ) + ] + 
+ [ + # special string based data types + ('(?P<UUID>uuid)', lambda token: Token(Tokens.UUID, token)) ] + [ # integers, for decimal spec (r'(?P<INTEGER>\d+)', lambda token: Token(Tokens.INTEGER, int(token))), @@ -1209,6 +1259,12 @@ def type(self) -> DataType: | "multipolygon" ":" geotype | "multipolygon" ";" srid ":" geotype + json : "json" + + jsonb : "jsonb" + + uuid : "uuid" + """ if self._accept(Tokens.PRIMITIVE): assert self.tok is not None @@ -1322,6 +1378,13 @@ def type(self) -> DataType: self._expect(Tokens.RBRACKET) return Struct(names, types) + # json data types + elif self._accept(Tokens.JSON): + return JSON() + + elif self._accept(Tokens.JSONB): + return JSONB() + # geo spatial data type elif self._accept(Tokens.GEOMETRY): return Geometry() @@ -1431,6 +1494,10 @@ def type(self) -> DataType: return MultiPolygon(geotype=geotype, srid=srid) + # special string based data types + elif self._accept(Tokens.UUID): + return UUID() + else: raise SyntaxError('Type cannot be parsed: {}'.format(self.text)) @@ -1763,6 +1830,16 @@ def can_cast_variadic( return castable(source.value_type, target.value_type) +@castable.register(JSON, JSON) +def can_cast_json(source, target, **kwargs): + return True + + +@castable.register(JSONB, JSONB) +def can_cast_jsonb(source, target, **kwargs): + return True + + # geo spatial data type # cast between same type, used to cast from/to geometry and geography GEO_TYPES = ( @@ -1782,6 +1859,11 @@ def can_cast_geospatial(source, target, **kwargs): return True +@castable.register(UUID, UUID) +def can_cast_special_string(source, target, **kwargs): + return True + + # @castable.register(Map, Map) # def can_cast_maps(source, target): # return (source.equals(target) or diff --git a/ibis/expr/types.py b/ibis/expr/types.py index 6262c2355c4c..1abcae398843 100644 --- a/ibis/expr/types.py +++ b/ibis/expr/types.py @@ -794,6 +794,30 @@ class MapColumn(AnyColumn, MapValue): pass # noqa: E701,E302 +class JSONValue(StringValue): + pass # noqa: E701,E302 + + +class 
JSONScalar(StringScalar, JSONValue): + pass # noqa: E701,E302 + + +class JSONColumn(StringColumn, JSONValue): + pass # noqa: E701,E302 + + +class JSONBValue(BinaryValue): + pass # noqa: E701,E302 + + +class JSONBScalar(BinaryScalar, JSONBValue): + pass # noqa: E701,E302 + + +class JSONBColumn(BinaryColumn, JSONBValue): + pass # noqa: E701,E302 + + class StructValue(AnyValue): def __dir__(self): return sorted( @@ -909,6 +933,18 @@ class MultiPolygonColumn(GeoSpatialColumn, MultiPolygonValue): # noqa: E302 pass # noqa: E701 +class UUIDValue(StringValue): + pass # noqa: E701,E302 + + +class UUIDScalar(StringScalar, UUIDValue): + pass # noqa: E701,E302 + + +class UUIDColumn(StringColumn, UUIDValue): + pass # noqa: E701,E302 + + class ListExpr(ColumnExpr, AnyValue): @property def values(self): diff --git a/ibis/sql/alchemy.py b/ibis/sql/alchemy.py index 8d4621cd5011..cf81263248c9 100644 --- a/ibis/sql/alchemy.py +++ b/ibis/sql/alchemy.py @@ -136,6 +136,21 @@ def sa_double(_, satype, nullable=True): return dt.Double(nullable=nullable) +@dt.dtype.register(PostgreSQLDialect, sa.dialects.postgresql.UUID) +def sa_uuid(_, satype, nullable=True): + return dt.UUID(nullable=nullable) + + +@dt.dtype.register(PostgreSQLDialect, sa.dialects.postgresql.JSON) +def sa_json(_, satype, nullable=True): + return dt.JSON(nullable=nullable) + + +@dt.dtype.register(PostgreSQLDialect, sa.dialects.postgresql.JSONB) +def sa_jsonb(_, satype, nullable=True): + return dt.JSONB(nullable=nullable) + + if geospatial_supported: @dt.dtype.register(SQLAlchemyDialect, (ga.Geometry, ga.types._GISType)) diff --git a/ibis/sql/postgres/tests/test_client.py b/ibis/sql/postgres/tests/test_client.py index afb1a2248ac9..a619f1116944 100644 --- a/ibis/sql/postgres/tests/test_client.py +++ b/ibis/sql/postgres/tests/test_client.py @@ -21,9 +21,10 @@ import ibis import ibis.expr.datatypes as dt import ibis.expr.types as ir +import ibis.sql.alchemy as alch # noqa: E402 from ibis.tests.util import assert_equal 
-pytest.importorskip('sqlalchemy') +sa = pytest.importorskip('sqlalchemy') pytest.importorskip('psycopg2') pytestmark = pytest.mark.postgresql @@ -136,6 +137,33 @@ def test_schema_table(): assert isinstance(schema['tables'], ir.TableExpr) +def test_schema_type_conversion(): + typespec = [ + # name, type, nullable + ('json', sa.dialects.postgresql.JSON, True, dt.JSON), + ('jsonb', sa.dialects.postgresql.JSONB, True, dt.JSONB), + ('uuid', sa.dialects.postgresql.UUID, True, dt.UUID), + ] + + sqla_types = [] + ibis_types = [] + for name, t, nullable, ibis_type in typespec: + sqla_type = sa.Column(name, t, nullable=nullable) + sqla_types.append(sqla_type) + ibis_types.append((name, ibis_type(nullable=nullable))) + + # Create a table with placeholder stubs for JSON, JSONB, and UUID. + engine = sa.create_engine('postgresql://') + table = sa.Table('tname', sa.MetaData(bind=engine), *sqla_types) + + # Check that we can correctly create a schema with dt.any for the + # missing types. + schema = alch.schema_from_table(table) + expected = ibis.schema(ibis_types) + + assert_equal(schema, expected) + + def test_interval_films_schema(con): t = con.table("films") assert t.len.type() == dt.Interval(unit="m") diff --git a/ibis/tests/all/test_json.py b/ibis/tests/all/test_json.py new file mode 100644 index 000000000000..bf3cd3d4848d --- /dev/null +++ b/ibis/tests/all/test_json.py @@ -0,0 +1,31 @@ +""" Tests for json data types""" +import json + +import pytest +from pytest import param + +import ibis +from ibis.tests.backends import PostgreSQL + +# add here backends that support json types +all_db_geo_supported = [PostgreSQL] + + +@pytest.mark.parametrize('data', [param({'status': True}, id='status')]) +@pytest.mark.only_on_backends(all_db_geo_supported) +def test_json(backend, con, data, alltypes): + json_value = json.dumps(data) + lit = ibis.literal(json_value, type='json').name('tmp') + expr = alltypes[[alltypes.id, lit]].head(1) + df = expr.execute() + assert df['tmp'].iloc[0] == 
json_value + + +@pytest.mark.parametrize('data', [param({'status': True}, id='status')]) +@pytest.mark.only_on_backends(all_db_geo_supported) +def test_jsonb(backend, con, data, alltypes): + jsonb_value = json.dumps(data).encode('utf8') + lit = ibis.literal(jsonb_value, type='jsonb').name('tmp') + expr = alltypes[[alltypes.id, lit]].head(1) + df = expr.execute() + assert df['tmp'].iloc[0] == jsonb_value diff --git a/ibis/tests/all/test_string.py b/ibis/tests/all/test_string.py index ccf73f181b14..84541426ab41 100644 --- a/ibis/tests/all/test_string.py +++ b/ibis/tests/all/test_string.py @@ -3,7 +3,7 @@ import ibis import ibis.expr.datatypes as dt -from ibis.tests.backends import Clickhouse, Impala, PySpark, Spark +from ibis.tests.backends import Clickhouse, Impala, PostgreSQL, PySpark, Spark def test_string_col_is_unicode(backend, alltypes, df): @@ -233,3 +233,15 @@ def test_string(backend, alltypes, df, result_func, expected_func): expected = backend.default_series_rename(expected_func(df)) backend.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + 'data, data_type', + [param('123e4567-e89b-12d3-a456-426655440000', 'uuid', id='uuid')], +) +@pytest.mark.only_on_backends([PostgreSQL]) +def test_special_strings(backend, con, alltypes, data, data_type): + lit = ibis.literal(data, type=data_type).name('tmp') + expr = alltypes[[alltypes.id, lit]].head(1) + df = expr.execute() + assert df['tmp'].iloc[0] == data