diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index db6742a8cd..afdabf6fce 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -36,13 +36,13 @@ repos: - id: ruff-format args: [ --preview ] - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.6.1 + rev: v1.8.0 hooks: - id: mypy args: [--install-types, --non-interactive, --config=pyproject.toml] - repo: https://github.com/hadialqattan/pycln - rev: v2.3.0 + rev: v2.4.0 hooks: - id: pycln args: [--config=pyproject.toml] diff --git a/pyiceberg/avro/decoder_fast.pyi b/pyiceberg/avro/decoder_fast.pyi index cf45ce5066..cc367c4764 100644 --- a/pyiceberg/avro/decoder_fast.pyi +++ b/pyiceberg/avro/decoder_fast.pyi @@ -20,37 +20,54 @@ from pyiceberg.avro.decoder import BinaryDecoder class CythonBinaryDecoder(BinaryDecoder): def __init__(self, input_contents: bytes) -> None: pass + def tell(self) -> int: pass + def read(self, n: int) -> bytes: pass + def read_boolean(self) -> bool: pass + def read_int(self) -> int: pass + def read_ints(self, count: int) -> tuple[int, ...]: pass + def read_int_bytes_dict(self, count: int, dest: dict[int, bytes]) -> None: pass + def read_bytes(self) -> bytes: pass + def read_float(self) -> float: pass + def read_double(self) -> float: pass + def read_utf8(self) -> str: pass + def skip(self, n: int) -> None: pass + def skip_int(self) -> None: pass + def skip_boolean(self) -> None: pass + def skip_float(self) -> None: pass + def skip_double(self) -> None: pass + def skip_bytes(self) -> None: pass + def skip_utf8(self) -> None: pass diff --git a/pyiceberg/avro/decoder_fast.pyx b/pyiceberg/avro/decoder_fast.pyx index 182fd0e92e..52caec3308 100644 --- a/pyiceberg/avro/decoder_fast.pyx +++ b/pyiceberg/avro/decoder_fast.pyx @@ -32,9 +32,7 @@ unsigned_long_long_array_template = cython.declare(array.array, array.array('Q', @cython.final cdef class CythonBinaryDecoder: - """Implement a BinaryDecoder that reads from an in-memory buffer. - - """ + """Implement a BinaryDecoder that reads from an in-memory buffer.""" # This the data that is duplicated when the decoder is created. cdef unsigned char *_data diff --git a/pyiceberg/catalog/__init__.py b/pyiceberg/catalog/__init__.py index 69e1c47d4b..f4361fb5d6 100644 --- a/pyiceberg/catalog/__init__.py +++ b/pyiceberg/catalog/__init__.py @@ -416,6 +416,8 @@ def _commit_table(self, table_request: CommitTableRequest) -> CommitTableRespons Raises: NoSuchTableError: If a table with the given identifier does not exist. + CommitFailedException: Requirement not met, or a conflict with a concurrent commit. + CommitStateUnknownException: Failed due to an internal exception on the side of the catalog. """ @abstractmethod diff --git a/pyiceberg/catalog/dynamodb.py b/pyiceberg/catalog/dynamodb.py index d5f3b5e14c..b7b0f3ddb1 100644 --- a/pyiceberg/catalog/dynamodb.py +++ b/pyiceberg/catalog/dynamodb.py @@ -208,6 +208,7 @@ def _commit_table(self, table_request: CommitTableRequest) -> CommitTableRespons Raises: NoSuchTableError: If a table with the given identifier does not exist. + CommitFailedException: Requirement not met, or a conflict with a concurrent commit. """ raise NotImplementedError diff --git a/pyiceberg/catalog/glue.py b/pyiceberg/catalog/glue.py index 06484cb0e4..089a30ba61 100644 --- a/pyiceberg/catalog/glue.py +++ b/pyiceberg/catalog/glue.py @@ -404,7 +404,7 @@ def _commit_table(self, table_request: CommitTableRequest) -> CommitTableRespons Raises: NoSuchTableError: If a table with the given identifier does not exist. - CommitFailedException: If the commit failed. + CommitFailedException: Requirement not met, or a conflict with a concurrent commit. """ identifier_tuple = self.identifier_to_tuple_without_catalog( tuple(table_request.identifier.namespace.root + [table_request.identifier.name]) diff --git a/pyiceberg/catalog/hive.py b/pyiceberg/catalog/hive.py index aba3c173e6..4d4370fc46 100644 --- a/pyiceberg/catalog/hive.py +++ b/pyiceberg/catalog/hive.py @@ -360,6 +360,7 @@ def _commit_table(self, table_request: CommitTableRequest) -> CommitTableRespons Raises: NoSuchTableError: If a table with the given identifier does not exist. + CommitFailedException: Requirement not met, or a conflict with a concurrent commit. """ identifier_tuple = self.identifier_to_tuple_without_catalog( tuple(table_request.identifier.namespace.root + [table_request.identifier.name]) diff --git a/pyiceberg/catalog/rest.py b/pyiceberg/catalog/rest.py index bfce820185..20d170064e 100644 --- a/pyiceberg/catalog/rest.py +++ b/pyiceberg/catalog/rest.py @@ -116,7 +116,7 @@ class Endpoints: SIGV4_SERVICE = "rest.signing-name" AUTH_URL = "rest.authorization-url" -NAMESPACE_SEPARATOR = b"\x1F".decode(UTF8) +NAMESPACE_SEPARATOR = b"\x1f".decode(UTF8) def _retry_hook(retry_state: RetryCallState) -> None: @@ -596,6 +596,8 @@ def _commit_table(self, table_request: CommitTableRequest) -> CommitTableRespons Raises: NoSuchTableError: If a table with the given identifier does not exist. + CommitFailedException: Requirement not met, or a conflict with a concurrent commit. + CommitStateUnknownException: Failed due to an internal exception on the side of the catalog. """ response = self._session.post( self.url(Endpoints.update_table, prefixed=True, **self._split_identifier_for_path(table_request.identifier)), diff --git a/pyiceberg/catalog/sql.py b/pyiceberg/catalog/sql.py index 62a2dac54a..0059da6676 100644 --- a/pyiceberg/catalog/sql.py +++ b/pyiceberg/catalog/sql.py @@ -373,7 +373,7 @@ def _commit_table(self, table_request: CommitTableRequest) -> CommitTableRespons Raises: NoSuchTableError: If a table with the given identifier does not exist. - CommitFailedException: If the commit failed. + CommitFailedException: Requirement not met, or a conflict with a concurrent commit. """ identifier_tuple = self.identifier_to_tuple_without_catalog( tuple(table_request.identifier.namespace.root + [table_request.identifier.name]) diff --git a/pyiceberg/conversions.py b/pyiceberg/conversions.py index b523deff48..2a03a4de35 100644 --- a/pyiceberg/conversions.py +++ b/pyiceberg/conversions.py @@ -267,7 +267,7 @@ def _(primitive_type: DecimalType, value: Decimal) -> bytes: return decimal_to_bytes(value) -@singledispatch +@singledispatch # type: ignore def from_bytes(primitive_type: PrimitiveType, b: bytes) -> L: # type: ignore """Convert bytes to a built-in python value. diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index f391abfea2..76bb041195 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -690,7 +690,9 @@ def _(obj: pa.StructType, visitor: PyArrowSchemaVisitor[T]) -> T: @visit_pyarrow.register(pa.ListType) -def _(obj: pa.ListType, visitor: PyArrowSchemaVisitor[T]) -> T: +@visit_pyarrow.register(pa.FixedSizeListType) +@visit_pyarrow.register(pa.LargeListType) +def _(obj: Union[pa.ListType, pa.LargeListType, pa.FixedSizeListType], visitor: PyArrowSchemaVisitor[T]) -> T: visitor.before_list_element(obj.value_field) result = visit_pyarrow(obj.value_type, visitor) visitor.after_list_element(obj.value_field) diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 9d7ca80da2..ba62e36117 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -1695,7 +1695,7 @@ def rename_column(self, path_from: Union[str, Tuple[str, ...]], new_name: str) - from_field_correct_casing = self._schema.find_column_name(field_from.field_id) if from_field_correct_casing in self._identifier_field_names: self._identifier_field_names.remove(from_field_correct_casing) - new_identifier_path = f"{from_field_correct_casing[:-len(field_from.name)]}{new_name}" + new_identifier_path = f"{from_field_correct_casing[: -len(field_from.name)]}{new_name}" self._identifier_field_names.add(new_identifier_path) return self diff --git a/pyproject.toml b/pyproject.toml index 71c7317dd2..52c60d9482 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -312,7 +312,7 @@ source = ['pyiceberg/'] [tool.ruff] src = ['pyiceberg','tests'] extend-exclude = ["dev/provision.py"] -select = [ +lint.select = [ "E", # pycodestyle "W", # pycodestyle "F", # Pyflakes @@ -322,11 +322,11 @@ select = [ "I", # isort "UP", # pyupgrade ] -ignore = ["E501","E203","B024","B028","UP037"] +lint.ignore = ["E501","E203","B024","B028","UP037"] # Allow autofix for all enabled rules (when `--fix`) is provided. -fixable = ["ALL"] -unfixable = [] +lint.fixable = ["ALL"] +lint.unfixable = [] # Exclude a variety of commonly ignored directories. exclude = [ @@ -352,19 +352,19 @@ exclude = [ "node_modules", "venv", ] -per-file-ignores = {} +lint.per-file-ignores = {} # Ignore _all_ violations. # Same as Black. line-length = 130 # Allow unused variables when underscore-prefixed. -dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" +lint.dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" -[tool.ruff.pyupgrade] +[tool.ruff.lint.pyupgrade] # Preserve types, even if a file imports `from __future__ import annotations`. keep-runtime-typing = true -[tool.ruff.isort] +[tool.ruff.lint.isort] detect-same-package = true lines-between-types = 0 known-first-party = ["pyiceberg", "tests"] diff --git a/tests/avro/test_decoder.py b/tests/avro/test_decoder.py index bbcc7394f4..608e6ae2d5 100644 --- a/tests/avro/test_decoder.py +++ b/tests/avro/test_decoder.py @@ -144,13 +144,13 @@ def test_read_single_byte_at_the_time(decoder_class: Callable[[bytes], BinaryDec @pytest.mark.parametrize("decoder_class", AVAILABLE_DECODERS) def test_read_float(decoder_class: Callable[[bytes], BinaryDecoder]) -> None: - decoder = decoder_class(b"\x00\x00\x9A\x41") + decoder = decoder_class(b"\x00\x00\x9a\x41") assert decoder.read_float() == 19.25 @pytest.mark.parametrize("decoder_class", AVAILABLE_DECODERS) def test_skip_float(decoder_class: Callable[[bytes], BinaryDecoder]) -> None: - decoder = decoder_class(b"\x00\x00\x9A\x41") + decoder = decoder_class(b"\x00\x00\x9a\x41") assert decoder.tell() == 0 decoder.skip_float() assert decoder.tell() == 4 @@ -179,13 +179,13 @@ def test_read_bytes(decoder_class: Callable[[bytes], BinaryDecoder]) -> None: @pytest.mark.parametrize("decoder_class", AVAILABLE_DECODERS) def test_read_utf8(decoder_class: Callable[[bytes], BinaryDecoder]) -> None: - decoder = decoder_class(b"\x04\x76\x6F") + decoder = decoder_class(b"\x04\x76\x6f") assert decoder.read_utf8() == "vo" @pytest.mark.parametrize("decoder_class", AVAILABLE_DECODERS) def test_skip_utf8(decoder_class: Callable[[bytes], BinaryDecoder]) -> None: - decoder = decoder_class(b"\x04\x76\x6F") + decoder = decoder_class(b"\x04\x76\x6f") assert decoder.tell() == 0 decoder.skip_utf8() assert decoder.tell() == 3 @@ -193,7 +193,7 @@ def test_skip_utf8(decoder_class: Callable[[bytes], BinaryDecoder]) -> None: @pytest.mark.parametrize("decoder_class", AVAILABLE_DECODERS) def test_read_int_as_float(decoder_class: Callable[[bytes], BinaryDecoder]) -> None: - decoder = decoder_class(b"\x00\x00\x9A\x41") + decoder = decoder_class(b"\x00\x00\x9a\x41") reader = resolve_reader(FloatType(), DoubleType()) assert reader.read(decoder) == 19.25 diff --git a/tests/io/test_pyarrow_visitor.py b/tests/io/test_pyarrow_visitor.py index 7d35cae424..5b55bd61b6 100644 --- a/tests/io/test_pyarrow_visitor.py +++ b/tests/io/test_pyarrow_visitor.py @@ -245,6 +245,26 @@ def test_pyarrow_list_to_iceberg() -> None: assert visit_pyarrow(pyarrow_list, _ConvertToIceberg()) == expected +def test_pyarrow_large_list_to_iceberg() -> None: + pyarrow_list = pa.large_list(pa.field("element", pa.int32(), nullable=False, metadata={"PARQUET:field_id": "1"})) + expected = ListType( + element_id=1, + element_type=IntegerType(), + element_required=True, + ) + assert visit_pyarrow(pyarrow_list, _ConvertToIceberg()) == expected + + +def test_pyarrow_fixed_size_list_to_iceberg() -> None: + pyarrow_list = pa.list_(pa.field("element", pa.int32(), nullable=False, metadata={"PARQUET:field_id": "1"}), 1) + expected = ListType( + element_id=1, + element_type=IntegerType(), + element_required=True, + ) + assert visit_pyarrow(pyarrow_list, _ConvertToIceberg()) == expected + + def test_pyarrow_map_to_iceberg() -> None: pyarrow_map = pa.map_( pa.field("key", pa.int32(), nullable=False, metadata={"PARQUET:field_id": "1"}),