diff --git a/ibis/expr/datatypes.py b/ibis/expr/datatypes.py index 301d2c496d1d..aa39bc30150c 100644 --- a/ibis/expr/datatypes.py +++ b/ibis/expr/datatypes.py @@ -129,9 +129,6 @@ def _literal_value_hash_key(self, value) -> int: class Any(DataType): - scalar = ir.AnyScalar - column = ir.AnyColumn - __slots__ = () @@ -639,6 +636,23 @@ def _literal_value_hash_key(self, value): return self, _tuplize(value.items()) +class JSON(String): + """JSON (JavaScript Object Notation) text format.""" + + scalar = ir.JSONScalar + column = ir.JSONColumn + + +class JSONB(Binary): + """JSON (JavaScript Object Notation) data stored as a binary + representation, which eliminates whitespace, duplicate keys, + and key ordering. + """ + + scalar = ir.JSONBScalar + column = ir.JSONBColumn + + class GeoSpatial(DataType): __slots__ = 'geotype', 'srid' @@ -782,6 +796,17 @@ class MultiPolygon(GeoSpatial): __slots__ = () +class UUID(String): + """A universally unique identifier (UUID) is a 128-bit number used to + identify information in computer systems. 
+ """ + + scalar = ir.UUIDScalar + column = ir.UUIDColumn + + __slots__ = () + + # --------------------------------------------------------------------- any = Any() null = Null() @@ -818,7 +843,11 @@ class MultiPolygon(GeoSpatial): multilinestring = MultiLineString() multipoint = MultiPoint() multipolygon = MultiPolygon() - +# json +json = JSON() +jsonb = JSONB() +# special string based data type +uuid = UUID() _primitive_types = [ ('any', any), @@ -884,6 +913,9 @@ class Tokens: MULTIPOINT = 28 MULTIPOLYGON = 29 SEMICOLON = 30 + JSON = 31 + JSONB = 32 + UUID = 33 @staticmethod def name(value): @@ -894,7 +926,6 @@ def name(value): (getattr(Tokens, n), n) for n in dir(Tokens) if n.isalpha() and n.isupper() ) - Token = collections.namedtuple('Token', ('type', 'value')) @@ -1008,6 +1039,22 @@ def name(value): ), ) ] + + [ + # json data type + ( + '(?P<{}>{})'.format(token.upper(), token), + lambda token, toktype=toktype: Token(toktype, token), + ) + for token, toktype in zip( + # note: `jsonb` should be first to avoid conflict with `json` + ('jsonb', 'json'), + (Tokens.JSONB, Tokens.JSON), + ) + ] + + [ + # special string based data types + ('(?P<UUID>uuid)', lambda token: Token(Tokens.UUID, token)) + ] + [ # integers, for decimal spec (r'(?P<INTEGER>\d+)', lambda token: Token(Tokens.INTEGER, int(token))), @@ -1212,6 +1259,12 @@ def type(self) -> DataType: | "multipolygon" ":" geotype | "multipolygon" ";" srid ":" geotype + json : "json" + + jsonb : "jsonb" + + uuid : "uuid" + """ if self._accept(Tokens.PRIMITIVE): assert self.tok is not None @@ -1325,6 +1378,13 @@ def type(self) -> DataType: self._expect(Tokens.RBRACKET) return Struct(names, types) + # json data types + elif self._accept(Tokens.JSON): + return JSON() + + elif self._accept(Tokens.JSONB): + return JSONB() + # geo spatial data type elif self._accept(Tokens.GEOMETRY): return Geometry() @@ -1434,6 +1494,10 @@ def type(self) -> DataType: return MultiPolygon(geotype=geotype, srid=srid) + # special string based data types + 
elif self._accept(Tokens.UUID): + return UUID() + else: raise SyntaxError('Type cannot be parsed: {}'.format(self.text)) @@ -1766,6 +1830,16 @@ def can_cast_variadic( return castable(source.value_type, target.value_type) +@castable.register(JSON, JSON) +def can_cast_json(source, target, **kwargs): + return True + + +@castable.register(JSONB, JSONB) +def can_cast_jsonb(source, target, **kwargs): + return True + + # geo spatial data type # cast between same type, used to cast from/to geometry and geography GEO_TYPES = ( @@ -1785,6 +1859,11 @@ def can_cast_geospatial(source, target, **kwargs): return True +@castable.register(UUID, UUID) +def can_cast_special_string(source, target, **kwargs): + return True + + # @castable.register(Map, Map) # def can_cast_maps(source, target): # return (source.equals(target) or diff --git a/ibis/expr/types.py b/ibis/expr/types.py index 6262c2355c4c..1abcae398843 100644 --- a/ibis/expr/types.py +++ b/ibis/expr/types.py @@ -794,6 +794,30 @@ class MapColumn(AnyColumn, MapValue): pass # noqa: E701,E302 +class JSONValue(StringValue): + pass # noqa: E701,E302 + + +class JSONScalar(StringScalar, JSONValue): + pass # noqa: E701,E302 + + +class JSONColumn(StringColumn, JSONValue): + pass # noqa: E701,E302 + + +class JSONBValue(BinaryValue): + pass # noqa: E701,E302 + + +class JSONBScalar(BinaryScalar, JSONBValue): + pass # noqa: E701,E302 + + +class JSONBColumn(BinaryColumn, JSONBValue): + pass # noqa: E701,E302 + + class StructValue(AnyValue): def __dir__(self): return sorted( @@ -909,6 +933,18 @@ class MultiPolygonColumn(GeoSpatialColumn, MultiPolygonValue): # noqa: E302 pass # noqa: E701 +class UUIDValue(StringValue): + pass # noqa: E701,E302 + + +class UUIDScalar(StringScalar, UUIDValue): + pass # noqa: E701,E302 + + +class UUIDColumn(StringColumn, UUIDValue): + pass # noqa: E701,E302 + + class ListExpr(ColumnExpr, AnyValue): @property def values(self): diff --git a/ibis/sql/alchemy.py b/ibis/sql/alchemy.py index 62102e5537b5..cf81263248c9 
100644 --- a/ibis/sql/alchemy.py +++ b/ibis/sql/alchemy.py @@ -138,17 +138,17 @@ def sa_double(_, satype, nullable=True): @dt.dtype.register(PostgreSQLDialect, sa.dialects.postgresql.UUID) def sa_uuid(_, satype, nullable=True): - return dt.Any(nullable=nullable) + return dt.UUID(nullable=nullable) @dt.dtype.register(PostgreSQLDialect, sa.dialects.postgresql.JSON) def sa_json(_, satype, nullable=True): - return dt.Any(nullable=nullable) + return dt.JSON(nullable=nullable) @dt.dtype.register(PostgreSQLDialect, sa.dialects.postgresql.JSONB) def sa_jsonb(_, satype, nullable=True): - return dt.Any(nullable=nullable) + return dt.JSONB(nullable=nullable) if geospatial_supported: diff --git a/ibis/sql/postgres/tests/test_client.py b/ibis/sql/postgres/tests/test_client.py index b38c86163b7d..a619f1116944 100644 --- a/ibis/sql/postgres/tests/test_client.py +++ b/ibis/sql/postgres/tests/test_client.py @@ -137,12 +137,12 @@ def test_schema_table(): assert isinstance(schema['tables'], ir.TableExpr) -def test_schema_unsupported_type_conversion(): +def test_schema_type_conversion(): typespec = [ # name, type, nullable - ('json', sa.dialects.postgresql.JSON, True, dt.any), - ('jsonb', sa.dialects.postgresql.JSONB, True, dt.any), - ('uuid', sa.dialects.postgresql.UUID, True, dt.any), + ('json', sa.dialects.postgresql.JSON, True, dt.JSON), + ('jsonb', sa.dialects.postgresql.JSONB, True, dt.JSONB), + ('uuid', sa.dialects.postgresql.UUID, True, dt.UUID), ] sqla_types = [] diff --git a/ibis/tests/all/test_json.py b/ibis/tests/all/test_json.py new file mode 100644 index 000000000000..bf3cd3d4848d --- /dev/null +++ b/ibis/tests/all/test_json.py @@ -0,0 +1,31 @@ +""" Tests for json data types""" +import json + +import pytest +from pytest import param + +import ibis +from ibis.tests.backends import PostgreSQL + +# add here backends that support json types +all_db_geo_supported = [PostgreSQL] + + +@pytest.mark.parametrize('data', [param({'status': True}, id='status')]) 
+@pytest.mark.only_on_backends(all_db_geo_supported) +def test_json(backend, con, data, alltypes): + json_value = json.dumps(data) + lit = ibis.literal(json_value, type='json').name('tmp') + expr = alltypes[[alltypes.id, lit]].head(1) + df = expr.execute() + assert df['tmp'].iloc[0] == json_value + + +@pytest.mark.parametrize('data', [param({'status': True}, id='status')]) +@pytest.mark.only_on_backends(all_db_geo_supported) +def test_jsonb(backend, con, data, alltypes): + jsonb_value = json.dumps(data).encode('utf8') + lit = ibis.literal(jsonb_value, type='jsonb').name('tmp') + expr = alltypes[[alltypes.id, lit]].head(1) + df = expr.execute() + assert df['tmp'].iloc[0] == jsonb_value diff --git a/ibis/tests/all/test_string.py b/ibis/tests/all/test_string.py index ccf73f181b14..84541426ab41 100644 --- a/ibis/tests/all/test_string.py +++ b/ibis/tests/all/test_string.py @@ -3,7 +3,7 @@ import ibis import ibis.expr.datatypes as dt -from ibis.tests.backends import Clickhouse, Impala, PySpark, Spark +from ibis.tests.backends import Clickhouse, Impala, PostgreSQL, PySpark, Spark def test_string_col_is_unicode(backend, alltypes, df): @@ -233,3 +233,15 @@ def test_string(backend, alltypes, df, result_func, expected_func): expected = backend.default_series_rename(expected_func(df)) backend.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + 'data, data_type', + [param('123e4567-e89b-12d3-a456-426655440000', 'uuid', id='uuid')], +) +@pytest.mark.only_on_backends([PostgreSQL]) +def test_special_strings(backend, con, alltypes, data, data_type): + lit = ibis.literal(data, type=data_type).name('tmp') + expr = alltypes[[alltypes.id, lit]].head(1) + df = expr.execute() + assert df['tmp'].iloc[0] == data