diff --git a/python/src/iceberg/types.py b/python/src/iceberg/types.py index 5a8ce2b9114d..af1f595663f8 100644 --- a/python/src/iceberg/types.py +++ b/python/src/iceberg/types.py @@ -21,17 +21,15 @@ Example: >>> StructType( - [ NestedField(True, 1, "required_field", StringType()), - NestedField(False, 2, "optional_field", IntegerType()), - ] + NestedField(False, 2, "optional_field", IntegerType()) ) Notes: - https://iceberg.apache.org/#spec/#primitive-types """ -from typing import Optional +from typing import Dict, Optional, Tuple class Singleton: @@ -43,11 +41,15 @@ def __new__(cls, *args, **kwargs): return cls._instance -class Type: - def __init__(self, type_string: str, repr_string: str, is_primitive=False): +class IcebergType: + """Base type for all Iceberg Types""" + + _initialized = False + + def __init__(self, type_string: str, repr_string: str): self._type_string = type_string self._repr_string = repr_string - self._is_primitive = is_primitive + self._initialized = True def __repr__(self): return self._repr_string @@ -57,33 +59,64 @@ def __str__(self): @property def is_primitive(self) -> bool: - return self._is_primitive + return isinstance(self, PrimitiveType) + + +class PrimitiveType(IcebergType): + """Base class for all Iceberg Primitive Types""" + + +class FixedType(PrimitiveType): + """A fixed data type in Iceberg. 
+ + Example: + >>> FixedType(8) + FixedType(length=8) + >>> FixedType(8)==FixedType(8) + True + """ + _instances: Dict[int, "FixedType"] = {} + + def __new__(cls, length: int): + cls._instances[length] = cls._instances.get(length) or object.__new__(cls) + return cls._instances[length] -class FixedType(Type): def __init__(self, length: int): - super().__init__(f"fixed[{length}]", f"FixedType(length={length})", is_primitive=True) - self._length = length + if not self._initialized: + super().__init__(f"fixed[{length}]", f"FixedType(length={length})") + self._length = length @property def length(self) -> int: return self._length - def __eq__(self, other): - if type(self) is type(other): - return self.length == other.length - return False +class DecimalType(PrimitiveType): + """A decimal data type in Iceberg. + + Example: + >>> DecimalType(32, 3) + DecimalType(precision=32, scale=3) + >>> DecimalType(8, 3)==DecimalType(8, 3) + True + """ + + _instances: Dict[Tuple[int, int], "DecimalType"] = {} + + def __new__(cls, precision: int, scale: int): + key = (precision, scale) + cls._instances[key] = cls._instances.get(key) or object.__new__(cls) + return cls._instances[key] -class DecimalType(Type): def __init__(self, precision: int, scale: int): - super().__init__( - f"decimal({precision}, {scale})", - f"DecimalType(precision={precision}, scale={scale})", - is_primitive=True, - ) - self._precision = precision - self._scale = scale + if not self._initialized: + super().__init__( + f"decimal({precision}, {scale})", + f"DecimalType(precision={precision}, scale={scale})", + ) + self._precision = precision + self._scale = scale @property def precision(self) -> int: @@ -93,26 +126,51 @@ def precision(self) -> int: def scale(self) -> int: return self._scale - def __eq__(self, other): - if type(self) is type(other): - return self.precision == other.precision and self.scale == other.scale - return False +class NestedField(IcebergType): + """Represents a field of a struct, a map key, a 
map value, or a list element. + + This is where field IDs, names, docs, and nullability are tracked. + """ + + _instances: Dict[Tuple[bool, int, str, IcebergType, Optional[str]], "NestedField"] = {} + + def __new__( + cls, + field_id: int, + name: str, + field_type: IcebergType, + is_optional: bool = True, + doc: Optional[str] = None, + ): + key = (is_optional, field_id, name, field_type, doc) + cls._instances[key] = cls._instances.get(key) or object.__new__(cls) + return cls._instances[key] -class NestedField: def __init__( self, - is_optional: bool, field_id: int, name: str, - field_type: Type, + field_type: IcebergType, + is_optional: bool = True, doc: Optional[str] = None, ): - self._is_optional = is_optional - self._id = field_id - self._name = name - self._type = field_type - self._doc = doc + if not self._initialized: + docString = "" if doc is None else f", doc={repr(doc)}" + super().__init__( + ( + f"{field_id}: {name}: {'optional' if is_optional else 'required'} {field_type}" "" + if doc is None + else f" ({doc})" + ), + f"NestedField(field_id={field_id}, name={repr(name)}, field_type={repr(field_type)}, is_optional={is_optional}" + f"{docString})", + ) + self._is_optional = is_optional + self._id = field_id + self._name = name + self._type = field_type + self._doc = doc @property def is_optional(self) -> bool: @@ -135,75 +193,129 @@ def doc(self) -> Optional[str]: return self._doc @property - def type(self) -> Type: + def type(self) -> IcebergType: return self._type - def __repr__(self): - return ( - f"NestedField(is_optional={self._is_optional}, field_id={self._id}, " - f"name={repr(self._name)}, field_type={repr(self._type)}, doc={repr(self._doc)})" - ) - def __str__(self): - return ( - f"{self._id}: {self._name}: {'optional' if self._is_optional else 'required'} {self._type}" "" - if self._doc is None - else f" ({self._doc})" +class StructType(IcebergType): + """A struct type in Iceberg + + Example: + >>> StructType( + NestedField(1, 
"required_field", StringType(), is_optional=False), + NestedField(2, "optional_field", IntegerType(), is_optional=True) ) + """ - def __eq__(self, other): - if type(self) is type(other): - return ( - self.is_optional == other.is_optional - and self.field_id == other.field_id - and self.name == other.name - and self.doc == other.doc - and self.type == other.type - ) - return False + _instances: Dict[Tuple[NestedField, ...], "StructType"] = {} + def __new__(cls, *fields: NestedField): + cls._instances[fields] = cls._instances.get(fields) or object.__new__(cls) + return cls._instances[fields] -class StructType(Type): - def __init__(self, fields: list): - super().__init__( - f"struct<{', '.join(map(str, fields))}>", - f"StructType(fields={repr(fields)})", - ) - self._fields = fields + def __init__(self, *fields: NestedField): + if not self._initialized: + super().__init__( + f"struct<{', '.join(map(str, fields))}>", + f"StructType{repr(fields)}", + ) + self._fields = fields @property - def fields(self) -> list: + def fields(self) -> Tuple[NestedField, ...]: return self._fields - def __eq__(self, other): - if type(self) is type(other): - return self.fields == other.fields - return False +class ListType(IcebergType): + """A list type in Iceberg + + Example: + >>> ListType(element_id=3, element_type=StringType(), element_is_optional=True) + ListType(element_id=3, element_type=StringType(), element_is_optional=True) + """ + + _instances: Dict[Tuple[bool, int, IcebergType], "ListType"] = {} -class ListType(Type): - def __init__(self, element: NestedField): - super().__init__(f"list<{element.type}>", f"ListType(element={repr(element)})") - self._element_field = element + def __new__( + cls, + element_id: int, + element_type: IcebergType, + element_is_optional: bool = True, + ): + key = (element_is_optional, element_id, element_type) + cls._instances[key] = cls._instances.get(key) or object.__new__(cls) + return cls._instances[key] + + def __init__( + self, + element_id: 
int, + element_type: IcebergType, + element_is_optional: bool = True, + ): + if not self._initialized: + super().__init__( + f"list<{element_type}>", + f"ListType(element_id={element_id}, element_type={repr(element_type)}, " + f"element_is_optional={element_is_optional})", + ) + self._element_field = NestedField( + name="element", + is_optional=element_is_optional, + field_id=element_id, + field_type=element_type, + ) @property def element(self) -> NestedField: return self._element_field - def __eq__(self, other): - if type(self) is type(other): - return self.element == other.element - return False +class MapType(IcebergType): + """A map type in Iceberg -class MapType(Type): - def __init__(self, key: NestedField, value: NestedField): - super().__init__( - f"map<{key.type}, {value.type}>", - f"MapType(key={repr(key)}, value={repr(value)})", + Example: + >>> MapType(key_id=1, key_type=StringType(), value_id=2, value_type=IntegerType(), value_is_optional=True) + MapType( + key=NestedField(is_optional=False, field_id=1, name='key', field_type=StringType(), doc=None), + value=NestedField(is_optional=True, field_id=2, name='value', field_type=IntegerType(), doc=None) ) - self._key_field = key - self._value_field = value + """ + + _instances: Dict[Tuple[int, IcebergType, int, IcebergType, bool], "MapType"] = {} + + def __new__( + cls, + key_id: int, + key_type: IcebergType, + value_id: int, + value_type: IcebergType, + value_is_optional: bool = True, + ): + impl_key = (key_id, key_type, value_id, value_type, value_is_optional) + cls._instances[impl_key] = cls._instances.get(impl_key) or object.__new__(cls) + return cls._instances[impl_key] + + def __init__( + self, + key_id: int, + key_type: IcebergType, + value_id: int, + value_type: IcebergType, + value_is_optional: bool = True, + ): + if not self._initialized: + super().__init__( + f"map<{key_type}, {value_type}>", + f"MapType(key_id={key_id}, key_type={repr(key_type)}, value_id={value_id}, 
value_type={repr(value_type)}, " + f"value_is_optional={value_is_optional})", + ) + self._key_field = NestedField(name="key", field_id=key_id, field_type=key_type, is_optional=False) + self._value_field = NestedField( + name="value", + field_id=value_id, + field_type=value_type, + is_optional=value_is_optional, + ) @property def key(self) -> NestedField: @@ -213,13 +325,8 @@ def key(self) -> NestedField: def value(self) -> NestedField: return self._value_field - def __eq__(self, other): - if type(self) is type(other): - return self.key == other.key and self.value == other.value - return False - -class BooleanType(Type, Singleton): +class BooleanType(PrimitiveType, Singleton): """A boolean data type in Iceberg can be represented using an instance of this class. Example: @@ -229,10 +336,11 @@ class BooleanType(Type, Singleton): """ def __init__(self): - super().__init__("boolean", "BooleanType()", is_primitive=True) + if not self._initialized: + super().__init__("boolean", "BooleanType()") -class IntegerType(Type, Singleton): +class IntegerType(PrimitiveType, Singleton): """An Integer data type in Iceberg can be represented using an instance of this class. Integers in Iceberg are 32-bit signed and can be promoted to Longs. @@ -253,10 +361,11 @@ class IntegerType(Type, Singleton): min: int = -2147483648 def __init__(self): - super().__init__("int", "IntegerType()", is_primitive=True) + if not self._initialized: + super().__init__("int", "IntegerType()") -class LongType(Type, Singleton): +class LongType(PrimitiveType, Singleton): """A Long data type in Iceberg can be represented using an instance of this class. Longs in Iceberg are 64-bit signed integers. 
@@ -277,10 +386,11 @@ class LongType(Type, Singleton): min: int = -9223372036854775808 def __init__(self): - super().__init__("long", "LongType()", is_primitive=True) + if not self._initialized: + super().__init__("long", "LongType()") -class FloatType(Type, Singleton): +class FloatType(PrimitiveType, Singleton): """A Float data type in Iceberg can be represented using an instance of this class. Floats in Iceberg are 32-bit IEEE 754 floating points and can be promoted to Doubles. @@ -291,10 +401,11 @@ class FloatType(Type, Singleton): """ def __init__(self): - super().__init__("float", "FloatType()", is_primitive=True) + if not self._initialized: + super().__init__("float", "FloatType()") -class DoubleType(Type, Singleton): +class DoubleType(PrimitiveType, Singleton): """A Double data type in Iceberg can be represented using an instance of this class. Doubles in Iceberg are 64-bit IEEE 754 floating points. @@ -305,10 +416,11 @@ class DoubleType(Type, Singleton): """ def __init__(self): - super().__init__("double", "DoubleType()", is_primitive=True) + if not self._initialized: + super().__init__("double", "DoubleType()") -class DateType(Type, Singleton): +class DateType(PrimitiveType, Singleton): """A Date data type in Iceberg can be represented using an instance of this class. Dates in Iceberg are calendar dates without a timezone or time. @@ -319,10 +431,11 @@ class DateType(Type, Singleton): """ def __init__(self): - super().__init__("date", "DateType()", is_primitive=True) + if not self._initialized: + super().__init__("date", "DateType()") -class TimeType(Type, Singleton): +class TimeType(PrimitiveType, Singleton): """A Time data type in Iceberg can be represented using an instance of this class. Times in Iceberg have microsecond precision and are a time of day without a date or timezone. 
@@ -330,14 +443,14 @@ class TimeType(Type, Singleton): >>> column_foo = TimeType() >>> isinstance(column_foo, TimeType) True - """ def __init__(self): - super().__init__("time", "TimeType()", is_primitive=True) + if not self._initialized: + super().__init__("time", "TimeType()") -class TimestampType(Type, Singleton): +class TimestampType(PrimitiveType, Singleton): """A Timestamp data type in Iceberg can be represented using an instance of this class. Timestamps in Iceberg have microsecond precision and include a date and a time of day without a timezone. @@ -345,14 +458,14 @@ class TimestampType(Type, Singleton): >>> column_foo = TimestampType() >>> isinstance(column_foo, TimestampType) True - """ def __init__(self): - super().__init__("timestamp", "TimestampType()", is_primitive=True) + if not self._initialized: + super().__init__("timestamp", "TimestampType()") -class TimestamptzType(Type, Singleton): +class TimestamptzType(PrimitiveType, Singleton): """A Timestamptz data type in Iceberg can be represented using an instance of this class. Timestamptzs in Iceberg are stored as UTC and include a date and a time of day with a timezone. @@ -363,10 +476,11 @@ class TimestamptzType(Type, Singleton): """ def __init__(self): - super().__init__("timestamptz", "TimestamptzType()", is_primitive=True) + if not self._initialized: + super().__init__("timestamptz", "TimestamptzType()") -class StringType(Type, Singleton): +class StringType(PrimitiveType, Singleton): """A String data type in Iceberg can be represented using an instance of this class. Strings in Iceberg are arbitrary-length character sequences and are encoded with UTF-8. 
@@ -377,10 +491,11 @@ class StringType(Type, Singleton): """ def __init__(self): - super().__init__("string", "StringType()", is_primitive=True) + if not self._initialized: + super().__init__("string", "StringType()") -class UUIDType(Type, Singleton): +class UUIDType(PrimitiveType, Singleton): """A UUID data type in Iceberg can be represented using an instance of this class. UUIDs in Iceberg are universally unique identifiers. @@ -391,10 +506,11 @@ class UUIDType(Type, Singleton): """ def __init__(self): - super().__init__("uuid", "UUIDType()", is_primitive=True) + if not self._initialized: + super().__init__("uuid", "UUIDType()") -class BinaryType(Type, Singleton): +class BinaryType(PrimitiveType, Singleton): """A Binary data type in Iceberg can be represented using an instance of this class. Binarys in Iceberg are arbitrary-length byte arrays. @@ -405,4 +521,5 @@ class BinaryType(Type, Singleton): """ def __init__(self): - super().__init__("binary", "BinaryType()", is_primitive=True) + if not self._initialized: + super().__init__("binary", "BinaryType()") diff --git a/python/tests/test_types.py b/python/tests/test_types.py index 5a6a411d8a19..07d3bcd0cfe8 100644 --- a/python/tests/test_types.py +++ b/python/tests/test_types.py @@ -59,6 +59,42 @@ def test_repr_primitive_types(input_index, input_type): assert isinstance(eval(repr(input_type())), input_type) +@pytest.mark.parametrize( + "input_type, result", + [ + (BooleanType(), True), + (IntegerType(), True), + (LongType(), True), + (FloatType(), True), + (DoubleType(), True), + (DateType(), True), + (TimeType(), True), + (TimestampType(), True), + (TimestamptzType(), True), + (StringType(), True), + (UUIDType(), True), + (BinaryType(), True), + (DecimalType(32, 3), True), + (FixedType(8), True), + (ListType(1, StringType(), True), False), + ( + MapType(1, StringType(), 2, IntegerType(), False), + False, + ), + ( + StructType( + NestedField(1, "required_field", StringType(), is_optional=False), + NestedField(2, 
"optional_field", IntegerType(), is_optional=True), + ), + False, + ), + (NestedField(1, "required_field", StringType(), is_optional=False), False), + ], +) +def test_is_primitive(input_type, result): + assert input_type.is_primitive == result + + def test_fixed_type(): type_var = FixedType(length=5) assert type_var.length == 5 @@ -82,41 +118,32 @@ def test_decimal_type(): def test_struct_type(): type_var = StructType( - [ - NestedField(True, 1, "optional_field", IntegerType()), - NestedField(False, 2, "required_field", FixedType(5)), - NestedField( - False, - 3, - "required_field", - StructType( - [ - NestedField(True, 4, "optional_field", DecimalType(8, 2)), - NestedField(False, 5, "required_field", LongType()), - ] - ), + NestedField(1, "optional_field", IntegerType(), is_optional=True), + NestedField(2, "required_field", FixedType(5), is_optional=False), + NestedField( + 3, + "required_field", + StructType( + NestedField(4, "optional_field", DecimalType(8, 2), is_optional=True), + NestedField(5, "required_field", LongType(), is_optional=False), ), - ] + is_optional=False, + ), ) assert len(type_var.fields) == 3 assert str(type_var) == str(eval(repr(type_var))) assert type_var == eval(repr(type_var)) - assert type_var != StructType([NestedField(True, 1, "optional_field", IntegerType())]) + assert type_var != StructType(NestedField(1, "optional_field", IntegerType(), is_optional=True)) def test_list_type(): type_var = ListType( - NestedField( - False, - 1, - "required_field", - StructType( - [ - NestedField(True, 2, "optional_field", DecimalType(8, 2)), - NestedField(False, 3, "required_field", LongType()), - ] - ), - ) + 1, + StructType( + NestedField(2, "optional_field", DecimalType(8, 2), is_optional=True), + NestedField(3, "required_field", LongType(), is_optional=False), + ), + False, ) assert isinstance(type_var.element.type, StructType) assert len(type_var.element.type.fields) == 2 @@ -124,64 +151,43 @@ def test_list_type(): assert str(type_var) == 
str(eval(repr(type_var))) assert type_var == eval(repr(type_var)) assert type_var != ListType( - NestedField( - True, - 1, - "required_field", - StructType( - [ - NestedField(True, 2, "optional_field", DecimalType(8, 2)), - ] - ), - ) + 1, + StructType( + NestedField(2, "optional_field", DecimalType(8, 2), is_optional=True), + ), + True, ) def test_map_type(): - type_var = MapType( - NestedField(True, 1, "optional_field", DoubleType()), - NestedField(False, 2, "required_field", UUIDType()), - ) + type_var = MapType(1, DoubleType(), 2, UUIDType(), False) assert isinstance(type_var.key.type, DoubleType) assert type_var.key.field_id == 1 assert isinstance(type_var.value.type, UUIDType) assert type_var.value.field_id == 2 assert str(type_var) == str(eval(repr(type_var))) assert type_var == eval(repr(type_var)) - assert type_var != MapType( - NestedField(True, 1, "optional_field", LongType()), - NestedField(False, 2, "required_field", UUIDType()), - ) - assert type_var != MapType( - NestedField(True, 1, "optional_field", DoubleType()), - NestedField(False, 2, "required_field", StringType()), - ) + assert type_var != MapType(1, LongType(), 2, UUIDType(), False) + assert type_var != MapType(1, DoubleType(), 2, StringType(), True) def test_nested_field(): field_var = NestedField( - True, 1, "optional_field1", StructType( - [ - NestedField( - True, - 2, - "optional_field2", - ListType(NestedField(False, 3, "required_field3", DoubleType())), - ), - NestedField( - False, - 4, - "required_field4", - MapType( - NestedField(True, 5, "optional_field5", TimeType()), - NestedField(False, 6, "required_field6", UUIDType()), - ), + NestedField( + 2, + "optional_field2", + ListType( + 3, + DoubleType(), + element_is_optional=False, ), - ] + is_optional=True, + ), ), + is_optional=True, ) assert field_var.is_optional assert not field_var.is_required