From b11cdb54b1a05cce0ade34af4ce81a94c34b2650 Mon Sep 17 00:00:00 2001
From: Sung Yun <107272191+syun64@users.noreply.github.com>
Date: Fri, 12 Jul 2024 16:45:04 -0400
Subject: [PATCH] Deprecate to_requested_schema (#918)

* deprecate to_requested_schema

* prep for release
---
 mkdocs/docs/how-to-release.md | 15 +++++++++++++++
 pyiceberg/io/pyarrow.py       | 20 ++++++++++++++++++++
 2 files changed, 35 insertions(+)

diff --git a/mkdocs/docs/how-to-release.md b/mkdocs/docs/how-to-release.md
index 99baec25ac..4824cb9994 100644
--- a/mkdocs/docs/how-to-release.md
+++ b/mkdocs/docs/how-to-release.md
@@ -23,6 +23,21 @@ The guide to release PyIceberg.
 
 The first step is to publish a release candidate (RC) and publish it to the public for testing and validation. Once the vote has passed on the RC, the RC turns into the new release.
 
+## Preparing for a release
+
+Before running the release candidate, we want to remove any APIs that were marked for removal under the @deprecated tag for this release.
+
+For example, the API with the following deprecation tag should be removed when preparing for the 0.2.0 release.
+
+```python
+
+@deprecated(
+    deprecated_in="0.1.0",
+    removed_in="0.2.0",
+    help_message="Please use load_something_else() instead",
+)
+```
+
 ## Running a release candidate
 
 Make sure that the version is correct in `pyproject.toml` and `pyiceberg/__init__.py`. Correct means that it reflects the version that you want to release.
diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py
index 1ef9fc9b68..199133f794 100644
--- a/pyiceberg/io/pyarrow.py
+++ b/pyiceberg/io/pyarrow.py
@@ -156,6 +156,7 @@
 from pyiceberg.utils.concurrent import ExecutorFactory
 from pyiceberg.utils.config import Config
 from pyiceberg.utils.datetime import millis_to_datetime
+from pyiceberg.utils.deprecated import deprecated
 from pyiceberg.utils.singleton import Singleton
 from pyiceberg.utils.truncate import truncate_upper_bound_binary_string, truncate_upper_bound_text_string
 
@@ -1279,6 +1280,23 @@ def project_batches(
             total_row_count += len(batch)
 
 
+@deprecated(
+    deprecated_in="0.7.0",
+    removed_in="0.8.0",
+    help_message="The public API for 'to_requested_schema' is deprecated and is replaced by '_to_requested_schema'",
+)
+def to_requested_schema(requested_schema: Schema, file_schema: Schema, table: pa.Table) -> pa.Table:
+    struct_array = visit_with_partner(requested_schema, table, ArrowProjectionVisitor(file_schema), ArrowAccessor(file_schema))
+
+    arrays = []
+    fields = []
+    for pos, field in enumerate(requested_schema.fields):
+        array = struct_array.field(pos)
+        arrays.append(array)
+        fields.append(pa.field(field.name, array.type, field.optional))
+    return pa.Table.from_arrays(arrays, schema=pa.schema(fields))
+
+
 def _to_requested_schema(
     requested_schema: Schema,
     file_schema: Schema,
@@ -1434,6 +1452,8 @@ def field_partner(self, partner_struct: Optional[pa.Array], field_id: int, _: st
 
             if isinstance(partner_struct, pa.StructArray):
                 return partner_struct.field(name)
+            elif isinstance(partner_struct, pa.Table):
+                return partner_struct.column(name).combine_chunks()
             elif isinstance(partner_struct, pa.RecordBatch):
                 return partner_struct.column(name)
             else: