Added JSON, JSONB, and UUID data types.
xmnlab committed Sep 28, 2019
1 parent 79bab94 commit 298efc2
Showing 6 changed files with 171 additions and 13 deletions.
89 changes: 84 additions & 5 deletions ibis/expr/datatypes.py
@@ -129,9 +129,6 @@ def _literal_value_hash_key(self, value) -> int:


class Any(DataType):
scalar = ir.AnyScalar
column = ir.AnyColumn

__slots__ = ()


@@ -639,6 +636,23 @@ def _literal_value_hash_key(self, value):
return self, _tuplize(value.items())


class JSON(String):
"""JSON (JavaScript Object Notation) text format."""

scalar = ir.JSONScalar
column = ir.JSONColumn


class JSONB(Binary):
"""JSON (JavaScript Object Notation) data stored as a binary
representation, which eliminates whitespace, duplicate keys,
and key ordering.
"""

scalar = ir.JSONBScalar
column = ir.JSONBColumn


class GeoSpatial(DataType):
__slots__ = 'geotype', 'srid'

@@ -782,6 +796,17 @@ class MultiPolygon(GeoSpatial):
__slots__ = ()


class UUID(String):
"""A universally unique identifier (UUID) is a 128-bit number used to
identify information in computer systems.
"""

scalar = ir.UUIDScalar
column = ir.UUIDColumn

__slots__ = ()


# ---------------------------------------------------------------------
any = Any()
null = Null()
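`UUID` gets the same treatment as a specialized `String` subtype, matching its canonical 36-character textual form. A short sketch, using the literal syntax that the tests added later in this commit exercise:

```python
import ibis
import ibis.expr.datatypes as dt

# UUID behaves as a specialized string type.
assert isinstance(dt.UUID(), dt.String)

# Literals can be tagged with the new type name.
lit = ibis.literal('123e4567-e89b-12d3-a456-426655440000', type='uuid')
```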
@@ -818,7 +843,11 @@ class MultiPolygon(GeoSpatial):
multilinestring = MultiLineString()
multipoint = MultiPoint()
multipolygon = MultiPolygon()

# json
json = JSON()
jsonb = JSONB()
# special string-based data type
uuid = UUID()

_primitive_types = [
('any', any),
@@ -884,6 +913,9 @@ class Tokens:
MULTIPOINT = 28
MULTIPOLYGON = 29
SEMICOLON = 30
JSON = 31
JSONB = 32
UUID = 33

@staticmethod
def name(value):
@@ -894,7 +926,6 @@ def name(value):
(getattr(Tokens, n), n) for n in dir(Tokens) if n.isalpha() and n.isupper()
)


Token = collections.namedtuple('Token', ('type', 'value'))


@@ -1008,6 +1039,22 @@ def name(value):
),
)
]
+ [
# json data type
(
'(?P<{}>{})'.format(token.upper(), token),
lambda token, toktype=toktype: Token(toktype, token),
)
for token, toktype in zip(
# note: `jsonb` must come before `json`, or the alternation matches only the `json` prefix
('jsonb', 'json'),
(Tokens.JSONB, Tokens.JSON),
)
]
+ [
# special string-based data types
('(?P<UUID>uuid)', lambda token: Token(Tokens.UUID, token))
]
+ [
# integers, for decimal spec
(r'(?P<INTEGER>\d+)', lambda token: Token(Tokens.INTEGER, int(token))),
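The ordering note above is load-bearing: the lexer joins all of these patterns into a single alternation, and Python's `re` engine commits to the first alternative that matches at a position. A standalone illustration of the failure mode the comment guards against:

```python
import re

# If 'json' came first, lexing 'jsonb' would stop at the prefix:
m = re.match('(?P<JSON>json)|(?P<JSONB>jsonb)', 'jsonb')
assert m.lastgroup == 'JSON' and m.group() == 'json'

# With 'jsonb' first, the full token is consumed:
m = re.match('(?P<JSONB>jsonb)|(?P<JSON>json)', 'jsonb')
assert m.lastgroup == 'JSONB' and m.group() == 'jsonb'
```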
@@ -1212,6 +1259,12 @@ def type(self) -> DataType:
| "multipolygon" ":" geotype
| "multipolygon" ";" srid ":" geotype
json : "json"
jsonb : "jsonb"
uuid : "uuid"
"""
if self._accept(Tokens.PRIMITIVE):
assert self.tok is not None
@@ -1325,6 +1378,13 @@ def type(self) -> DataType:
self._expect(Tokens.RBRACKET)
return Struct(names, types)

# json data types
elif self._accept(Tokens.JSON):
return JSON()

elif self._accept(Tokens.JSONB):
return JSONB()

# geo spatial data type
elif self._accept(Tokens.GEOMETRY):
return Geometry()
@@ -1434,6 +1494,10 @@ def type(self) -> DataType:

return MultiPolygon(geotype=geotype, srid=srid)

# special string-based data types
elif self._accept(Tokens.UUID):
return UUID()

else:
raise SyntaxError('Type cannot be parsed: {}'.format(self.text))
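With the tokens and grammar branches wired up, the type parser round-trips the new names. Since `dt.dtype` (the same dispatcher the PostgreSQL rules below register against) also parses type strings, a quick sanity check might look like:

```python
import ibis.expr.datatypes as dt

# The parser maps each new name to its type instance.
assert dt.dtype('json') == dt.JSON()
assert dt.dtype('jsonb') == dt.JSONB()
assert dt.dtype('uuid') == dt.UUID()
```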

@@ -1766,6 +1830,16 @@ def can_cast_variadic(
return castable(source.value_type, target.value_type)


@castable.register(JSON, JSON)
def can_cast_json(source, target, **kwargs):
return True


@castable.register(JSONB, JSONB)
def can_cast_jsonb(source, target, **kwargs):
return True


# geo spatial data type
# cast between same type, used to cast from/to geometry and geography
GEO_TYPES = (
@@ -1785,6 +1859,11 @@ def can_cast_geospatial(source, target, **kwargs):
return True


@castable.register(UUID, UUID)
def can_cast_special_string(source, target, **kwargs):
return True


# @castable.register(Map, Map)
# def can_cast_maps(source, target):
# return (source.equals(target) or
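The cast rules are deliberately narrow: each new type is declared castable only to itself, mirroring the geospatial rule above. A sketch of how the registrations surface through the `castable` dispatcher:

```python
import ibis.expr.datatypes as dt

# Same-type casts succeed under the new rules.
assert dt.castable(dt.JSON(), dt.JSON())
assert dt.castable(dt.JSONB(), dt.JSONB())
assert dt.castable(dt.UUID(), dt.UUID())
```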
36 changes: 36 additions & 0 deletions ibis/expr/types.py
@@ -794,6 +794,30 @@ class MapColumn(AnyColumn, MapValue):
pass # noqa: E701,E302


class JSONValue(StringValue):
pass # noqa: E701,E302


class JSONScalar(StringScalar, JSONValue):
pass # noqa: E701,E302


class JSONColumn(StringColumn, JSONValue):
pass # noqa: E701,E302


class JSONBValue(BinaryValue):
pass # noqa: E701,E302


class JSONBScalar(BinaryScalar, JSONBValue):
pass # noqa: E701,E302


class JSONBColumn(BinaryColumn, JSONBValue):
pass # noqa: E701,E302


class StructValue(AnyValue):
def __dir__(self):
return sorted(
@@ -909,6 +933,18 @@ class MultiPolygonColumn(GeoSpatialColumn, MultiPolygonValue):  # noqa: E302
pass # noqa: E701


class UUIDValue(StringValue):
pass # noqa: E701,E302


class UUIDScalar(StringScalar, UUIDValue):
pass # noqa: E701,E302


class UUIDColumn(StringColumn, UUIDValue):
pass # noqa: E701,E302


class ListExpr(ColumnExpr, AnyValue):
@property
def values(self):
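Because the new expression classes multiply-inherit from the `String` and `Binary` expression hierarchies, columns of the new types pick up their parents' operations without further work. A sketch under that assumption; the table and column names are illustrative, not part of this commit:

```python
import ibis

# Hypothetical unbound table using the new type names.
t = ibis.table([('doc', 'json'), ('id', 'uuid')], name='t')

# UUIDColumn inherits StringColumn, so string operations type-check.
expr = t.id.length()
```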
6 changes: 3 additions & 3 deletions ibis/sql/alchemy.py
@@ -138,17 +138,17 @@ def sa_double(_, satype, nullable=True):

@dt.dtype.register(PostgreSQLDialect, sa.dialects.postgresql.UUID)
def sa_uuid(_, satype, nullable=True):
return dt.Any(nullable=nullable)
return dt.UUID(nullable=nullable)


@dt.dtype.register(PostgreSQLDialect, sa.dialects.postgresql.JSON)
def sa_json(_, satype, nullable=True):
return dt.Any(nullable=nullable)
return dt.JSON(nullable=nullable)


@dt.dtype.register(PostgreSQLDialect, sa.dialects.postgresql.JSONB)
def sa_jsonb(_, satype, nullable=True):
return dt.Any(nullable=nullable)
return dt.JSONB(nullable=nullable)


if geospatial_supported:
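Previously all three PostgreSQL column types collapsed to `dt.Any` during schema reflection; with these rules they map to precise ibis types, which is what the renamed test below verifies end to end. A direct-call sketch, assuming `PostgreSQLDialect` is importable from `ibis.sql.alchemy` and instantiable with no arguments:

```python
import sqlalchemy as sa
import ibis.expr.datatypes as dt
from ibis.sql.alchemy import PostgreSQLDialect  # assumed import path

# Reflection now yields a precise type instead of dt.Any.
assert dt.dtype(PostgreSQLDialect(), sa.dialects.postgresql.JSONB()) == dt.JSONB()
```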
8 changes: 4 additions & 4 deletions ibis/sql/postgres/tests/test_client.py
@@ -137,12 +137,12 @@ def test_schema_table():
assert isinstance(schema['tables'], ir.TableExpr)


def test_schema_unsupported_type_conversion():
def test_schema_type_conversion():
typespec = [
# name, type, nullable
('json', sa.dialects.postgresql.JSON, True, dt.any),
('jsonb', sa.dialects.postgresql.JSONB, True, dt.any),
('uuid', sa.dialects.postgresql.UUID, True, dt.any),
('json', sa.dialects.postgresql.JSON, True, dt.JSON),
('jsonb', sa.dialects.postgresql.JSONB, True, dt.JSONB),
('uuid', sa.dialects.postgresql.UUID, True, dt.UUID),
]

sqla_types = []
31 changes: 31 additions & 0 deletions ibis/tests/all/test_json.py
@@ -0,0 +1,31 @@
""" Tests for json data types"""
import json

import pytest
from pytest import param

import ibis
from ibis.tests.backends import PostgreSQL

# backends that support JSON types
all_db_json_supported = [PostgreSQL]


@pytest.mark.parametrize('data', [param({'status': True}, id='status')])
@pytest.mark.only_on_backends(all_db_json_supported)
def test_json(backend, con, data, alltypes):
json_value = json.dumps(data)
lit = ibis.literal(json_value, type='json').name('tmp')
expr = alltypes[[alltypes.id, lit]].head(1)
df = expr.execute()
assert df['tmp'].iloc[0] == json_value


@pytest.mark.parametrize('data', [param({'status': True}, id='status')])
@pytest.mark.only_on_backends(all_db_json_supported)
def test_jsonb(backend, con, data, alltypes):
jsonb_value = json.dumps(data).encode('utf8')
lit = ibis.literal(jsonb_value, type='jsonb').name('tmp')
expr = alltypes[[alltypes.id, lit]].head(1)
df = expr.execute()
assert df['tmp'].iloc[0] == jsonb_value
14 changes: 13 additions & 1 deletion ibis/tests/all/test_string.py
@@ -3,7 +3,7 @@

import ibis
import ibis.expr.datatypes as dt
from ibis.tests.backends import Clickhouse, Impala, PySpark, Spark
from ibis.tests.backends import Clickhouse, Impala, PostgreSQL, PySpark, Spark


def test_string_col_is_unicode(backend, alltypes, df):
@@ -233,3 +233,15 @@ def test_string(backend, alltypes, df, result_func, expected_func):

expected = backend.default_series_rename(expected_func(df))
backend.assert_series_equal(result, expected)


@pytest.mark.parametrize(
'data, data_type',
[param('123e4567-e89b-12d3-a456-426655440000', 'uuid', id='uuid')],
)
@pytest.mark.only_on_backends([PostgreSQL])
def test_special_strings(backend, con, alltypes, data, data_type):
lit = ibis.literal(data, type=data_type).name('tmp')
expr = alltypes[[alltypes.id, lit]].head(1)
df = expr.execute()
assert df['tmp'].iloc[0] == data
