From a7794cad8e09c43f26e7be627c4e999d330c6cec Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Wed, 7 Feb 2024 14:30:15 +0100 Subject: [PATCH] Arrow: Support Arrow large-string (#382) --- pyiceberg/io/pyarrow.py | 2 +- tests/io/test_pyarrow_visitor.py | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index 91d8452eab..904fab2e28 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -864,7 +864,7 @@ def primitive(self, primitive: pa.DataType) -> PrimitiveType: elif isinstance(primitive, pa.Decimal128Type): primitive = cast(pa.Decimal128Type, primitive) return DecimalType(primitive.precision, primitive.scale) - elif pa.types.is_string(primitive): + elif pa.types.is_string(primitive) or pa.types.is_large_string(primitive): return StringType() elif pa.types.is_date32(primitive): return DateType() diff --git a/tests/io/test_pyarrow_visitor.py b/tests/io/test_pyarrow_visitor.py index c7f364b920..c30a53a4d4 100644 --- a/tests/io/test_pyarrow_visitor.py +++ b/tests/io/test_pyarrow_visitor.py @@ -272,6 +272,15 @@ def test_round_schema_conversion_nested(table_schema_nested: Schema) -> None: assert actual == expected +def test_round_schema_large_string() -> None: + schema = pa.schema([pa.field("animals", pa.large_string())]) + actual = str(pyarrow_to_schema(schema, name_mapping=NameMapping([MappedField(field_id=1, names=["animals"])]))) + expected = """table { + 1: animals: optional string +}""" + assert actual == expected + + def test_simple_schema_has_missing_ids() -> None: schema = pa.schema([ pa.field('foo', pa.string(), nullable=False),