From 6ca85ac866e0d39e448e833fffb358e6ef5d6824 Mon Sep 17 00:00:00 2001 From: Felix Scherz Date: Wed, 17 Jul 2024 08:23:29 +0200 Subject: [PATCH] chore: merge latest commits on main commit 1ed3abdd1aec480911eeec4f0f46a04efe53dc06 Author: Sung Yun <107272191+syun64@users.noreply.github.com> Date: Wed Jul 17 02:04:52 2024 -0400 Allow writing `pa.Table` that are either a subset of table schema or in arbitrary order, and support type promotion on write (#921) * merge * thanks @HonahX :) Co-authored-by: Honah J. * support promote * revert promote * use a visitor * support promotion on write * fix * Thank you @Fokko ! Co-authored-by: Fokko Driesprong * revert * add-files promotiontest * support promote for add_files * add tests for uuid * add_files subset schema test --------- Co-authored-by: Honah J. Co-authored-by: Fokko Driesprong commit 0f2e19e49eb8b859cfcd7f89cd182461a61f15a7 Author: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon Jul 15 23:25:08 2024 -0700 Bump zstandard from 0.22.0 to 0.23.0 (#934) Bumps [zstandard](https://github.com/indygreg/python-zstandard) from 0.22.0 to 0.23.0. - [Release notes](https://github.com/indygreg/python-zstandard/releases) - [Changelog](https://github.com/indygreg/python-zstandard/blob/main/docs/news.rst) - [Commits](https://github.com/indygreg/python-zstandard/compare/0.22.0...0.23.0) --- updated-dependencies: - dependency-name: zstandard dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> commit ec73d972486c0cfc45a936b2d260f354d424d9c1 Author: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon Jul 15 23:24:47 2024 -0700 Bump griffe from 0.47.0 to 0.48.0 (#933) Bumps [griffe](https://github.com/mkdocstrings/griffe) from 0.47.0 to 0.48.0. - [Release notes](https://github.com/mkdocstrings/griffe/releases) - [Changelog](https://github.com/mkdocstrings/griffe/blob/main/CHANGELOG.md) - [Commits](https://github.com/mkdocstrings/griffe/compare/0.47.0...0.48.0) --- updated-dependencies: - dependency-name: griffe dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> commit d05a4236cd0760c80d8125819359c957f33115e4 Author: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon Jul 15 23:24:16 2024 -0700 Bump mkdocs-material from 9.5.28 to 9.5.29 (#932) Bumps [mkdocs-material](https://github.com/squidfunk/mkdocs-material) from 9.5.28 to 9.5.29. - [Release notes](https://github.com/squidfunk/mkdocs-material/releases) - [Changelog](https://github.com/squidfunk/mkdocs-material/blob/master/CHANGELOG) - [Commits](https://github.com/squidfunk/mkdocs-material/compare/9.5.28...9.5.29) --- updated-dependencies: - dependency-name: mkdocs-material dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> commit e27cd9095503cfe9fa7e0a806ba25d42920c68c5 Author: Yair Halevi (Spock) <118175475+spock-abadai@users.noreply.github.com> Date: Sun Jul 14 22:11:04 2024 +0300 Allow empty `names` in mapped field of Name Mapping (#927) * Remove check_at_least_one field validator Iceberg spec permits an emtpy list of names in the default name mapping. check_at_least_one is therefore unnecessary. 
* Remove irrelevant test case * Fixing pydantic model No longer requiring minimum length of names list to be 1. * Added test case for empty names in name mapping * Fixed formatting error commit 3f44dfe711e96beda6aa8622cf5b0baffa6eb0f2 Author: Soumya Ghosh Date: Sun Jul 14 00:35:38 2024 +0530 Lowercase bool values in table properties (#924) commit b11cdb54b1a05cce0ade34af4ce81a94c34b2650 Author: Sung Yun <107272191+syun64@users.noreply.github.com> Date: Fri Jul 12 16:45:04 2024 -0400 Deprecate to_requested_schema (#918) * deprecate to_requested_schema * prep for release commit a3dd531dc9bef65df19a4b1c14d289e92d060cb1 Author: Honah J Date: Fri Jul 12 13:14:40 2024 -0700 Glue endpoint config variable, continue #530 (#920) Co-authored-by: Seb Pretzer <24555985+sebpretzer@users.noreply.github.com> commit 32e8f88ebf8e45ae0a7f60a848ea44044a9564ef Author: Sung Yun <107272191+syun64@users.noreply.github.com> Date: Fri Jul 12 15:26:00 2024 -0400 support PyArrow timestamptz with Etc/UTC (#910) Co-authored-by: Fokko Driesprong commit f6d56e9865b4e7e59eb8da539a11014faeb4085b Author: Sung Yun <107272191+syun64@users.noreply.github.com> Date: Fri Jul 12 05:31:06 2024 -0400 fix invalidation logic (#911) commit 6488ad88eeb8a65fd0aa4e11071c829609e36424 Author: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu Jul 11 22:56:48 2024 -0700 Bump coverage from 7.5.4 to 7.6.0 (#917) Bumps [coverage](https://github.com/nedbat/coveragepy) from 7.5.4 to 7.6.0. - [Release notes](https://github.com/nedbat/coveragepy/releases) - [Changelog](https://github.com/nedbat/coveragepy/blob/master/CHANGES.rst) - [Commits](https://github.com/nedbat/coveragepy/compare/7.5.4...7.6.0) --- updated-dependencies: - dependency-name: coverage dependency-type: direct:development update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> commit dceedfac4ec072ee4da99bf02dc93c1d27be45a9 Author: Sung Yun <107272191+syun64@users.noreply.github.com> Date: Thu Jul 11 20:32:14 2024 -0400 Check if schema is compatible in `add_files` API (#907) Co-authored-by: Fokko Driesprong commit aceed2ad9898eaa8b0cdddd883a195e58af5d0e3 Author: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu Jul 11 15:52:06 2024 +0200 Bump mypy-boto3-glue from 1.34.136 to 1.34.143 (#912) Bumps [mypy-boto3-glue](https://github.com/youtype/mypy_boto3_builder) from 1.34.136 to 1.34.143. - [Release notes](https://github.com/youtype/mypy_boto3_builder/releases) - [Commits](https://github.com/youtype/mypy_boto3_builder/commits) --- updated-dependencies: - dependency-name: mypy-boto3-glue dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> commit 1b9b884e56f74c7b7d1802774317ee95d799c5f2 Author: Fokko Driesprong Date: Thu Jul 11 12:45:20 2024 +0200 PyArrow: Don't enforce the schema when reading/writing (#902) * PyArrow: Don't enforce the schema PyIceberg struggled with the different type of arrow, such as the `string` and `large_string`. They represent the same, but are different under the hood. My take is that we should hide these kind of details from the user as much as possible. Now we went down the road of passing in the Iceberg schema into Arrow, but when doing this, Iceberg has to decide if it is a large or non-large type. 
This PR removes passing down the schema in order to let Arrow decide unless: - The type should be evolved - In case of re-ordering, we reorder the original types * WIP * Reuse Table schema * Make linter happy * Squash some bugs * Thanks Sung! Co-authored-by: Sung Yun <107272191+syun64@users.noreply.github.com> * Moar code moar bugs * Remove the variables wrt file sizes * Linting * Go with large ones for now * Missed one there! --------- Co-authored-by: Sung Yun <107272191+syun64@users.noreply.github.com> commit 8f47dfd2a0f586d58aa29e165540706066ea5282 Author: Soumya Ghosh Date: Thu Jul 11 11:52:55 2024 +0530 Move determine_partitions and helper methods to io.pyarrow (#906) commit 5aa451d41c2e7a89032a75f2e2adea31e10af309 Author: Soumya Ghosh Date: Thu Jul 11 07:57:05 2024 +0530 Rename data_sequence_number to sequence_number in ManifestEntry (#900) commit 77a07c90b7ca05c5d915c5c02047807c76b5031e Author: Honah J Date: Wed Jul 10 03:56:13 2024 -0700 Support MergeAppend operations (#363) * add ListPacker + tests * add merge append * add merge_append * fix snapshot inheritance * test manifest file and entries * add doc * fix lint * change test name * address review comments * rename _MergingSnapshotProducer to _SnapshotProducer * fix a serious bug * update the doc * remove merge_append as public API * make default to false * add test description * fix merge conflict * fix snapshot_id issue commit 66b92ffddd888e1f1c1fc25fa2ab7a00b45a3ff6 Author: Fokko Driesprong Date: Wed Jul 10 10:09:20 2024 +0200 GCS: Fix incorrect token description (#909) commit c25e0800240d13ff7295c61b0ea038a38980360d Author: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue Jul 9 20:50:29 2024 -0700 Bump zipp from 3.17.0 to 3.19.1 (#905) Bumps [zipp](https://github.com/jaraco/zipp) from 3.17.0 to 3.19.1. - [Release notes](https://github.com/jaraco/zipp/releases) - [Changelog](https://github.com/jaraco/zipp/blob/main/NEWS.rst) - [Commits](https://github.com/jaraco/zipp/compare/v3.17.0...v3.19.1) --- updated-dependencies: - dependency-name: zipp dependency-type: indirect ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> commit 301e336926d950f1d3424b8f87423504150368e0 Author: Sung Yun <107272191+syun64@users.noreply.github.com> Date: Tue Jul 9 23:35:11 2024 -0400 Cast 's', 'ms' and 'ns' PyArrow timestamp to 'us' precision on write (#848) commit 3f574d389b4b5cd17654638a40963eacf65563f1 Author: Fokko Driesprong Date: Tue Jul 9 11:36:43 2024 +0200 Support partial deletes (#569) * Add option to delete datafiles This is done through the Iceberg metadata, resulting in efficient deletes if the data is partitioned correctly * Pull in main * WIP * Change DataScan to accept Metadata and io For the partial deletes I want to do a scan on in memory metadata. Changing this API allows this. * fix name-mapping issue * WIP * WIP * Moar tests * Oops * Cleanup * WIP * WIP * Fix summary generation * Last few bits * Fix the requirement * Make ruff happy * Comments, thanks Kevin! 
* Comments * Append rather than truncate * Fix merge conflicts * Make the tests pass * Add another test * Conflicts * Add docs (#33) * docs * docs * Add a partitioned overwrite test * Fix comment * Skip empty manifests --------- Co-authored-by: HonahX Co-authored-by: Sung Yun <107272191+syun64@users.noreply.github.com> commit cdc3e54aad23638a3411fb4a771a2901e5bf8a93 Author: Fokko Driesprong Date: Tue Jul 9 08:28:27 2024 +0200 Disallow writing empty Manifest files (#876) * Disallow writing empty Avro files/blocks Raising an exception when doing this might look extreme, but there is no real good reason to allow this. * Relax the constaints a bit commit b68e1097a1c3c42d8739140179a1a07230c70c73 Author: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon Jul 8 22:16:23 2024 -0700 Bump fastavro from 1.9.4 to 1.9.5 (#904) Bumps [fastavro](https://github.com/fastavro/fastavro) from 1.9.4 to 1.9.5. - [Release notes](https://github.com/fastavro/fastavro/releases) - [Changelog](https://github.com/fastavro/fastavro/blob/master/ChangeLog) - [Commits](https://github.com/fastavro/fastavro/compare/1.9.4...1.9.5) --- updated-dependencies: - dependency-name: fastavro dependency-type: direct:development update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> commit 90547bb6a44b8b216fe974f746c33af36def635d Author: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon Jul 8 22:15:39 2024 -0700 Bump moto from 5.0.10 to 5.0.11 (#903) Bumps [moto](https://github.com/getmoto/moto) from 5.0.10 to 5.0.11. - [Release notes](https://github.com/getmoto/moto/releases) - [Changelog](https://github.com/getmoto/moto/blob/master/CHANGELOG.md) - [Commits](https://github.com/getmoto/moto/compare/5.0.10...5.0.11) --- updated-dependencies: - dependency-name: moto dependency-type: direct:development update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> commit 7dff359e0515839fbe24fac2108dcb2d64694b7a Author: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun Jul 7 07:50:19 2024 +0200 Bump tenacity from 8.4.2 to 8.5.0 (#898) commit 4aa469e14b3a0fbaeb8fc6acd30e67eed84ee91b Author: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat Jul 6 22:30:59 2024 +0200 Bump certifi from 2024.2.2 to 2024.7.4 (#899) Bumps [certifi](https://github.com/certifi/python-certifi) from 2024.2.2 to 2024.7.4. - [Commits](https://github.com/certifi/python-certifi/compare/2024.02.02...2024.07.04) --- updated-dependencies: - dependency-name: certifi dependency-type: indirect ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> commit aa7ad78c943053de31e3fb4457697ddb1344eda6 Author: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat Jul 6 20:37:51 2024 +0200 Bump deptry from 0.16.1 to 0.16.2 (#897) Bumps [deptry](https://github.com/fpgmaas/deptry) from 0.16.1 to 0.16.2. - [Release notes](https://github.com/fpgmaas/deptry/releases) - [Changelog](https://github.com/fpgmaas/deptry/blob/main/CHANGELOG.md) - [Commits](https://github.com/fpgmaas/deptry/compare/0.16.1...0.16.2) --- updated-dependencies: - dependency-name: deptry dependency-type: direct:development update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- mkdocs/docs/api.md | 21 +- mkdocs/docs/configuration.md | 57 +- mkdocs/docs/how-to-release.md | 15 + mkdocs/requirements.txt | 4 +- poetry.lock | 384 ++++---- pyiceberg/catalog/__init__.py | 6 +- pyiceberg/catalog/glue.py | 6 +- pyiceberg/io/pyarrow.py | 399 +++++++-- pyiceberg/manifest.py | 101 ++- pyiceberg/schema.py | 100 +++ pyiceberg/table/__init__.py | 826 ++++++++++++------ pyiceberg/table/name_mapping.py | 14 +- pyiceberg/table/snapshots.py | 2 +- pyiceberg/types.py | 2 +- pyiceberg/utils/bin_packing.py | 26 + pyproject.toml | 2 +- tests/avro/test_file.py | 10 +- tests/catalog/integration_test_glue.py | 11 +- tests/catalog/test_glue.py | 10 + tests/catalog/test_sql.py | 35 +- tests/cli/test_console.py | 9 +- tests/conftest.py | 212 ++++- tests/integration/test_add_files.py | 300 ++++++- tests/integration/test_deletes.py | 419 +++++++++ tests/integration/test_inspect_table.py | 25 +- tests/integration/test_rest_schema.py | 18 +- .../test_writes/test_partitioned_writes.py | 46 +- tests/integration/test_writes/test_writes.py | 453 +++++++++- tests/io/test_pyarrow.py | 314 ++++++- tests/io/test_pyarrow_visitor.py | 36 +- tests/table/test_init.py | 176 ---- tests/table/test_name_mapping.py | 22 +- tests/table/test_snapshots.py | 4 - tests/test_types.py | 12 + tests/utils/test_bin_packing.py | 46 +- tests/utils/test_manifest.py | 29 +- 36 files changed, 3314 insertions(+), 838 deletions(-) create mode 100644 tests/integration/test_deletes.py diff --git a/mkdocs/docs/api.md b/mkdocs/docs/api.md index 0e80b6eb5e..7386d0297a 100644 --- a/mkdocs/docs/api.md +++ b/mkdocs/docs/api.md @@ -331,12 +331,25 @@ df = pa.Table.from_pylist( table.append(df) ``` - +You can delete some of the data from the table by calling `tbl.delete()` with a desired `delete_filter`. + +```python +tbl.delete(delete_filter="city == 'Paris'") +``` -!!! example "Under development" - Writing using PyIceberg is still under development. Support for [partial overwrites](https://github.com/apache/iceberg-python/issues/268) and writing to [partitioned tables](https://github.com/apache/iceberg-python/issues/208) is planned and being worked on. +In the above example, any records where the city field value equals to `Paris` will be deleted. +Running `tbl.scan().to_arrow()` will now yield: - +``` +pyarrow.Table +city: string +lat: double +long: double +---- +city: [["Amsterdam","San Francisco","Drachten"],["Groningen"]] +lat: [[52.371807,37.773972,53.11254],[53.21917]] +long: [[4.896029,-122.431297,6.0989],[6.56667]] +``` ## Inspecting tables diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md index cf22f8e640..76e1816c3a 100644 --- a/mkdocs/docs/configuration.md +++ b/mkdocs/docs/configuration.md @@ -61,6 +61,21 @@ Iceberg tables support table properties to configure table behavior. 
| `write.parquet.dict-size-bytes` | Size in bytes | 2MB | Set the dictionary page size limit per row group | | `write.parquet.row-group-limit` | Number of rows | 122880 | The Parquet row group limit | +## Table behavior options + +| Key | Options | Default | Description | +| ------------------------------------ | ------------------- | ------------- | ----------------------------------------------------------- | +| `commit.manifest.target-size-bytes` | Size in bytes | 8388608 (8MB) | Target size when merging manifest files | +| `commit.manifest.min-count-to-merge` | Number of manifests | 100 | Target size when merging manifest files | +| `commit.manifest-merge.enabled` | Boolean | False | Controls whether to automatically merge manifests on writes | + + + +!!! note "Fast append" + Unlike Java implementation, PyIceberg default to the [fast append](api.md#write-support) and thus `commit.manifest-merge.enabled` is set to `False` by default. + + + # FileIO Iceberg works with the concept of a FileIO which is a pluggable module for reading, writing, and deleting files. By default, PyIceberg will try to initialize the FileIO that's suitable for the scheme (`s3://`, `gs://`, etc.) and will use the first one that's installed. @@ -129,19 +144,19 @@ For the FileIO there are several configuration options available: -| Key | Example | Description | -| --------------------------- | ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| gcs.project-id | my-gcp-project | Configure Google Cloud Project for GCS FileIO. | -| gcs.oauth2.token | ya29.dr.AfM... | Configure method authentication to GCS for FileIO. Can be the following, 'google_default', 'cache', 'anon', 'browser', 'cloud'. If not specified your credentials will be resolved in the following order: gcloud CLI default, gcsfs cached token, google compute metadata service, anonymous. | -| gcs.oauth2.token-expires-at | 1690971805918 | Configure expiration for credential generated with an access token. Milliseconds since epoch | -| gcs.access | read_only | Configure client to have specific access. Must be one of 'read_only', 'read_write', or 'full_control' | -| gcs.consistency | md5 | Configure the check method when writing files. Must be one of 'none', 'size', or 'md5' | -| gcs.cache-timeout | 60 | Configure the cache expiration time in seconds for object metadata cache | -| gcs.requester-pays | False | Configure whether to use requester-pays requests | -| gcs.session-kwargs | {} | Configure a dict of parameters to pass on to aiohttp.ClientSession; can contain, for example, proxy settings. | -| gcs.endpoint | http://0.0.0.0:4443 | Configure an alternative endpoint for the GCS FileIO to access (format protocol://host:port) If not given, defaults to the value of environment variable "STORAGE_EMULATOR_HOST"; if that is not set either, will use the standard Google endpoint. | -| gcs.default-location | US | Configure the default location where buckets are created, like 'US' or 'EUROPE-WEST3'. | -| gcs.version-aware | False | Configure whether to support object versioning on the GCS bucket. 
| +| Key | Example | Description | +| --------------------------- | ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| gcs.project-id | my-gcp-project | Configure Google Cloud Project for GCS FileIO. | +| gcs.oauth2.token | ya29.dr.AfM... | String representation of the access token used for temporary access. | +| gcs.oauth2.token-expires-at | 1690971805918 | Configure expiration for credential generated with an access token. Milliseconds since epoch | +| gcs.access | read_only | Configure client to have specific access. Must be one of 'read_only', 'read_write', or 'full_control' | +| gcs.consistency | md5 | Configure the check method when writing files. Must be one of 'none', 'size', or 'md5' | +| gcs.cache-timeout | 60 | Configure the cache expiration time in seconds for object metadata cache | +| gcs.requester-pays | False | Configure whether to use requester-pays requests | +| gcs.session-kwargs | {} | Configure a dict of parameters to pass on to aiohttp.ClientSession; can contain, for example, proxy settings. | +| gcs.endpoint | http://0.0.0.0:4443 | Configure an alternative endpoint for the GCS FileIO to access (format protocol://host:port) If not given, defaults to the value of environment variable "STORAGE_EMULATOR_HOST"; if that is not set either, will use the standard Google endpoint. | +| gcs.default-location | US | Configure the default location where buckets are created, like 'US' or 'EUROPE-WEST3'. | +| gcs.version-aware | False | Configure whether to support object versioning on the GCS bucket. | @@ -273,6 +288,16 @@ catalog: region_name: ``` + + +| Key | Example | Description | +| ----------------- | ------------------------------------ | ------------------------------------------------------------------------------- | +| glue.id | 111111111111 | Configure the 12-digit ID of the Glue Catalog | +| glue.skip-archive | true | Configure whether to skip the archival of older table versions. Default to true | +| glue.endpoint | https://glue.us-east-1.amazonaws.com | Configure an alternative endpoint of the Glue service for GlueCatalog to access | + + + ## DynamoDB Catalog If you want to use AWS DynamoDB as the catalog, you can use the last two ways to configure the pyiceberg and refer @@ -305,4 +330,8 @@ PyIceberg uses multiple threads to parallelize operations. The number of workers # Backward Compatibility -Previous versions of Java (`<1.4.0`) implementations incorrectly assume the optional attribute `current-snapshot-id` to be a required attribute in TableMetadata. This means that if `current-snapshot-id` is missing in the metadata file (e.g. on table creation), the application will throw an exception without being able to load the table. This assumption has been corrected in more recent Iceberg versions. However, it is possible to force PyIceberg to create a table with a metadata file that will be compatible with previous versions. This can be configured by setting the `legacy-current-snapshot-id` entry as "True" in the configuration file, or by setting the `PYICEBERG_LEGACY_CURRENT_SNAPSHOT_ID` environment variable. 
Refer to the [PR discussion](https://github.com/apache/iceberg-python/pull/473) for more details on the issue +Previous versions of Java (`<1.4.0`) implementations incorrectly assume the optional attribute `current-snapshot-id` to be a required attribute in TableMetadata. This means that if `current-snapshot-id` is missing in the metadata file (e.g. on table creation), the application will throw an exception without being able to load the table. This assumption has been corrected in more recent Iceberg versions. However, it is possible to force PyIceberg to create a table with a metadata file that will be compatible with previous versions. This can be configured by setting the `legacy-current-snapshot-id` property as "True" in the configuration file, or by setting the `PYICEBERG_LEGACY_CURRENT_SNAPSHOT_ID` environment variable. Refer to the [PR discussion](https://github.com/apache/iceberg-python/pull/473) for more details on the issue + +# Nanoseconds Support + +PyIceberg currently only supports upto microsecond precision in its TimestampType. PyArrow timestamp types in 's' and 'ms' will be upcast automatically to 'us' precision timestamps on write. Timestamps in 'ns' precision can also be downcast automatically on write if desired. This can be configured by setting the `downcast-ns-timestamp-to-us-on-write` property as "True" in the configuration file, or by setting the `PYICEBERG_DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE` environment variable. Refer to the [nanoseconds timestamp proposal document](https://docs.google.com/document/d/1bE1DcEGNzZAMiVJSZ0X1wElKLNkT9kRkk0hDlfkXzvU/edit#heading=h.ibflcctc9i1d) for more details on the long term roadmap for nanoseconds support diff --git a/mkdocs/docs/how-to-release.md b/mkdocs/docs/how-to-release.md index 99baec25ac..4824cb9994 100644 --- a/mkdocs/docs/how-to-release.md +++ b/mkdocs/docs/how-to-release.md @@ -23,6 +23,21 @@ The guide to release PyIceberg. The first step is to publish a release candidate (RC) and publish it to the public for testing and validation. Once the vote has passed on the RC, the RC turns into the new release. +## Preparing for a release + +Before running the release candidate, we want to remove any APIs that were marked for removal under the @deprecated tag for this release. + +For example, the API with the following deprecation tag should be removed when preparing for the 0.2.0 release. + +```python + +@deprecated( + deprecated_in="0.1.0", + removed_in="0.2.0", + help_message="Please use load_something_else() instead", +) +``` + ## Running a release candidate Make sure that the version is correct in `pyproject.toml` and `pyiceberg/__init__.py`. Correct means that it reflects the version that you want to release. diff --git a/mkdocs/requirements.txt b/mkdocs/requirements.txt index eeee2095a2..f1938f7428 100644 --- a/mkdocs/requirements.txt +++ b/mkdocs/requirements.txt @@ -16,13 +16,13 @@ # under the License. mkdocs==1.6.0 -griffe==0.47.0 +griffe==0.48.0 jinja2==3.1.4 mkdocstrings==0.25.1 mkdocstrings-python==1.10.5 mkdocs-literate-nav==0.6.1 mkdocs-autorefs==1.0.1 mkdocs-gen-files==0.5.0 -mkdocs-material==9.5.28 +mkdocs-material==9.5.29 mkdocs-material-extensions==1.3.1 mkdocs-section-index==0.3.9 diff --git a/poetry.lock b/poetry.lock index e77743dfdc..f114cc7e57 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. 
[[package]] name = "adlfs" @@ -375,8 +375,8 @@ files = [ jmespath = ">=0.7.1,<2.0.0" python-dateutil = ">=2.1,<3.0.0" urllib3 = [ - {version = ">=1.25.4,<2.2.0 || >2.2.0,<3", markers = "python_version >= \"3.10\""}, {version = ">=1.25.4,<1.27", markers = "python_version < \"3.10\""}, + {version = ">=1.25.4,<2.2.0 || >2.2.0,<3", markers = "python_version >= \"3.10\""}, ] [package.extras] @@ -419,13 +419,13 @@ files = [ [[package]] name = "certifi" -version = "2024.2.2" +version = "2024.7.4" description = "Python package for providing Mozilla's CA Bundle." optional = false python-versions = ">=3.6" files = [ - {file = "certifi-2024.2.2-py3-none-any.whl", hash = "sha256:dc383c07b76109f368f6106eee2b593b04a011ea4d55f652c6ca24a754d1cdd1"}, - {file = "certifi-2024.2.2.tar.gz", hash = "sha256:0569859f95fc761b18b45ef421b1290a0f65f147e92a1e5eb3e635f9a5e4e66f"}, + {file = "certifi-2024.7.4-py3-none-any.whl", hash = "sha256:c198e21b1289c2ab85ee4e67bb4b4ef3ead0892059901a8d5b622f24a1101e90"}, + {file = "certifi-2024.7.4.tar.gz", hash = "sha256:5a1e7645bc0ec61a09e26c36f6106dd4cf40c6db3a1fb6352b0244e7fb057c7b"}, ] [[package]] @@ -652,63 +652,63 @@ files = [ [[package]] name = "coverage" -version = "7.5.4" +version = "7.6.0" description = "Code coverage measurement for Python" optional = false python-versions = ">=3.8" files = [ - {file = "coverage-7.5.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6cfb5a4f556bb51aba274588200a46e4dd6b505fb1a5f8c5ae408222eb416f99"}, - {file = "coverage-7.5.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2174e7c23e0a454ffe12267a10732c273243b4f2d50d07544a91198f05c48f47"}, - {file = "coverage-7.5.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2214ee920787d85db1b6a0bd9da5f8503ccc8fcd5814d90796c2f2493a2f4d2e"}, - {file = "coverage-7.5.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1137f46adb28e3813dec8c01fefadcb8c614f33576f672962e323b5128d9a68d"}, - {file = "coverage-7.5.4-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b385d49609f8e9efc885790a5a0e89f2e3ae042cdf12958b6034cc442de428d3"}, - {file = "coverage-7.5.4-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:b4a474f799456e0eb46d78ab07303286a84a3140e9700b9e154cfebc8f527016"}, - {file = "coverage-7.5.4-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:5cd64adedf3be66f8ccee418473c2916492d53cbafbfcff851cbec5a8454b136"}, - {file = "coverage-7.5.4-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:e564c2cf45d2f44a9da56f4e3a26b2236504a496eb4cb0ca7221cd4cc7a9aca9"}, - {file = "coverage-7.5.4-cp310-cp310-win32.whl", hash = "sha256:7076b4b3a5f6d2b5d7f1185fde25b1e54eb66e647a1dfef0e2c2bfaf9b4c88c8"}, - {file = "coverage-7.5.4-cp310-cp310-win_amd64.whl", hash = "sha256:018a12985185038a5b2bcafab04ab833a9a0f2c59995b3cec07e10074c78635f"}, - {file = "coverage-7.5.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:db14f552ac38f10758ad14dd7b983dbab424e731588d300c7db25b6f89e335b5"}, - {file = "coverage-7.5.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3257fdd8e574805f27bb5342b77bc65578e98cbc004a92232106344053f319ba"}, - {file = "coverage-7.5.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3a6612c99081d8d6134005b1354191e103ec9705d7ba2754e848211ac8cacc6b"}, - {file = "coverage-7.5.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:d45d3cbd94159c468b9b8c5a556e3f6b81a8d1af2a92b77320e887c3e7a5d080"}, - {file = "coverage-7.5.4-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ed550e7442f278af76d9d65af48069f1fb84c9f745ae249c1a183c1e9d1b025c"}, - {file = "coverage-7.5.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:7a892be37ca35eb5019ec85402c3371b0f7cda5ab5056023a7f13da0961e60da"}, - {file = "coverage-7.5.4-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8192794d120167e2a64721d88dbd688584675e86e15d0569599257566dec9bf0"}, - {file = "coverage-7.5.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:820bc841faa502e727a48311948e0461132a9c8baa42f6b2b84a29ced24cc078"}, - {file = "coverage-7.5.4-cp311-cp311-win32.whl", hash = "sha256:6aae5cce399a0f065da65c7bb1e8abd5c7a3043da9dceb429ebe1b289bc07806"}, - {file = "coverage-7.5.4-cp311-cp311-win_amd64.whl", hash = "sha256:d2e344d6adc8ef81c5a233d3a57b3c7d5181f40e79e05e1c143da143ccb6377d"}, - {file = "coverage-7.5.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:54317c2b806354cbb2dc7ac27e2b93f97096912cc16b18289c5d4e44fc663233"}, - {file = "coverage-7.5.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:042183de01f8b6d531e10c197f7f0315a61e8d805ab29c5f7b51a01d62782747"}, - {file = "coverage-7.5.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a6bb74ed465d5fb204b2ec41d79bcd28afccf817de721e8a807d5141c3426638"}, - {file = "coverage-7.5.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b3d45ff86efb129c599a3b287ae2e44c1e281ae0f9a9bad0edc202179bcc3a2e"}, - {file = "coverage-7.5.4-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5013ed890dc917cef2c9f765c4c6a8ae9df983cd60dbb635df8ed9f4ebc9f555"}, - {file = "coverage-7.5.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1014fbf665fef86cdfd6cb5b7371496ce35e4d2a00cda501cf9f5b9e6fced69f"}, - {file = "coverage-7.5.4-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:3684bc2ff328f935981847082ba4fdc950d58906a40eafa93510d1b54c08a66c"}, - {file = "coverage-7.5.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:581ea96f92bf71a5ec0974001f900db495488434a6928a2ca7f01eee20c23805"}, - {file = "coverage-7.5.4-cp312-cp312-win32.whl", hash = "sha256:73ca8fbc5bc622e54627314c1a6f1dfdd8db69788f3443e752c215f29fa87a0b"}, - {file = "coverage-7.5.4-cp312-cp312-win_amd64.whl", hash = "sha256:cef4649ec906ea7ea5e9e796e68b987f83fa9a718514fe147f538cfeda76d7a7"}, - {file = "coverage-7.5.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:cdd31315fc20868c194130de9ee6bfd99755cc9565edff98ecc12585b90be882"}, - {file = "coverage-7.5.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:02ff6e898197cc1e9fa375581382b72498eb2e6d5fc0b53f03e496cfee3fac6d"}, - {file = "coverage-7.5.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d05c16cf4b4c2fc880cb12ba4c9b526e9e5d5bb1d81313d4d732a5b9fe2b9d53"}, - {file = "coverage-7.5.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c5986ee7ea0795a4095ac4d113cbb3448601efca7f158ec7f7087a6c705304e4"}, - {file = "coverage-7.5.4-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5df54843b88901fdc2f598ac06737f03d71168fd1175728054c8f5a2739ac3e4"}, - {file = "coverage-7.5.4-cp38-cp38-musllinux_1_2_aarch64.whl", hash = 
"sha256:ab73b35e8d109bffbda9a3e91c64e29fe26e03e49addf5b43d85fc426dde11f9"}, - {file = "coverage-7.5.4-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:aea072a941b033813f5e4814541fc265a5c12ed9720daef11ca516aeacd3bd7f"}, - {file = "coverage-7.5.4-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:16852febd96acd953b0d55fc842ce2dac1710f26729b31c80b940b9afcd9896f"}, - {file = "coverage-7.5.4-cp38-cp38-win32.whl", hash = "sha256:8f894208794b164e6bd4bba61fc98bf6b06be4d390cf2daacfa6eca0a6d2bb4f"}, - {file = "coverage-7.5.4-cp38-cp38-win_amd64.whl", hash = "sha256:e2afe743289273209c992075a5a4913e8d007d569a406ffed0bd080ea02b0633"}, - {file = "coverage-7.5.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b95c3a8cb0463ba9f77383d0fa8c9194cf91f64445a63fc26fb2327e1e1eb088"}, - {file = "coverage-7.5.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3d7564cc09dd91b5a6001754a5b3c6ecc4aba6323baf33a12bd751036c998be4"}, - {file = "coverage-7.5.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:44da56a2589b684813f86d07597fdf8a9c6ce77f58976727329272f5a01f99f7"}, - {file = "coverage-7.5.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e16f3d6b491c48c5ae726308e6ab1e18ee830b4cdd6913f2d7f77354b33f91c8"}, - {file = "coverage-7.5.4-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dbc5958cb471e5a5af41b0ddaea96a37e74ed289535e8deca404811f6cb0bc3d"}, - {file = "coverage-7.5.4-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:a04e990a2a41740b02d6182b498ee9796cf60eefe40cf859b016650147908029"}, - {file = "coverage-7.5.4-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:ddbd2f9713a79e8e7242d7c51f1929611e991d855f414ca9996c20e44a895f7c"}, - {file = "coverage-7.5.4-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:b1ccf5e728ccf83acd313c89f07c22d70d6c375a9c6f339233dcf792094bcbf7"}, - {file = "coverage-7.5.4-cp39-cp39-win32.whl", hash = "sha256:56b4eafa21c6c175b3ede004ca12c653a88b6f922494b023aeb1e836df953ace"}, - {file = "coverage-7.5.4-cp39-cp39-win_amd64.whl", hash = "sha256:65e528e2e921ba8fd67d9055e6b9f9e34b21ebd6768ae1c1723f4ea6ace1234d"}, - {file = "coverage-7.5.4-pp38.pp39.pp310-none-any.whl", hash = "sha256:79b356f3dd5b26f3ad23b35c75dbdaf1f9e2450b6bcefc6d0825ea0aa3f86ca5"}, - {file = "coverage-7.5.4.tar.gz", hash = "sha256:a44963520b069e12789d0faea4e9fdb1e410cdc4aab89d94f7f55cbb7fef0353"}, + {file = "coverage-7.6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:dff044f661f59dace805eedb4a7404c573b6ff0cdba4a524141bc63d7be5c7fd"}, + {file = "coverage-7.6.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a8659fd33ee9e6ca03950cfdcdf271d645cf681609153f218826dd9805ab585c"}, + {file = "coverage-7.6.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7792f0ab20df8071d669d929c75c97fecfa6bcab82c10ee4adb91c7a54055463"}, + {file = "coverage-7.6.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d4b3cd1ca7cd73d229487fa5caca9e4bc1f0bca96526b922d61053ea751fe791"}, + {file = "coverage-7.6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e7e128f85c0b419907d1f38e616c4f1e9f1d1b37a7949f44df9a73d5da5cd53c"}, + {file = "coverage-7.6.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a94925102c89247530ae1dab7dc02c690942566f22e189cbd53579b0693c0783"}, + {file = "coverage-7.6.0-cp310-cp310-musllinux_1_2_i686.whl", hash = 
"sha256:dcd070b5b585b50e6617e8972f3fbbee786afca71b1936ac06257f7e178f00f6"}, + {file = "coverage-7.6.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:d50a252b23b9b4dfeefc1f663c568a221092cbaded20a05a11665d0dbec9b8fb"}, + {file = "coverage-7.6.0-cp310-cp310-win32.whl", hash = "sha256:0e7b27d04131c46e6894f23a4ae186a6a2207209a05df5b6ad4caee6d54a222c"}, + {file = "coverage-7.6.0-cp310-cp310-win_amd64.whl", hash = "sha256:54dece71673b3187c86226c3ca793c5f891f9fc3d8aa183f2e3653da18566169"}, + {file = "coverage-7.6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c7b525ab52ce18c57ae232ba6f7010297a87ced82a2383b1afd238849c1ff933"}, + {file = "coverage-7.6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4bea27c4269234e06f621f3fac3925f56ff34bc14521484b8f66a580aacc2e7d"}, + {file = "coverage-7.6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ed8d1d1821ba5fc88d4a4f45387b65de52382fa3ef1f0115a4f7a20cdfab0e94"}, + {file = "coverage-7.6.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:01c322ef2bbe15057bc4bf132b525b7e3f7206f071799eb8aa6ad1940bcf5fb1"}, + {file = "coverage-7.6.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:03cafe82c1b32b770a29fd6de923625ccac3185a54a5e66606da26d105f37dac"}, + {file = "coverage-7.6.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0d1b923fc4a40c5832be4f35a5dab0e5ff89cddf83bb4174499e02ea089daf57"}, + {file = "coverage-7.6.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:4b03741e70fb811d1a9a1d75355cf391f274ed85847f4b78e35459899f57af4d"}, + {file = "coverage-7.6.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:a73d18625f6a8a1cbb11eadc1d03929f9510f4131879288e3f7922097a429f63"}, + {file = "coverage-7.6.0-cp311-cp311-win32.whl", hash = "sha256:65fa405b837060db569a61ec368b74688f429b32fa47a8929a7a2f9b47183713"}, + {file = "coverage-7.6.0-cp311-cp311-win_amd64.whl", hash = "sha256:6379688fb4cfa921ae349c76eb1a9ab26b65f32b03d46bb0eed841fd4cb6afb1"}, + {file = "coverage-7.6.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:f7db0b6ae1f96ae41afe626095149ecd1b212b424626175a6633c2999eaad45b"}, + {file = "coverage-7.6.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:bbdf9a72403110a3bdae77948b8011f644571311c2fb35ee15f0f10a8fc082e8"}, + {file = "coverage-7.6.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9cc44bf0315268e253bf563f3560e6c004efe38f76db03a1558274a6e04bf5d5"}, + {file = "coverage-7.6.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:da8549d17489cd52f85a9829d0e1d91059359b3c54a26f28bec2c5d369524807"}, + {file = "coverage-7.6.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0086cd4fc71b7d485ac93ca4239c8f75732c2ae3ba83f6be1c9be59d9e2c6382"}, + {file = "coverage-7.6.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1fad32ee9b27350687035cb5fdf9145bc9cf0a094a9577d43e909948ebcfa27b"}, + {file = "coverage-7.6.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:044a0985a4f25b335882b0966625270a8d9db3d3409ddc49a4eb00b0ef5e8cee"}, + {file = "coverage-7.6.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:76d5f82213aa78098b9b964ea89de4617e70e0d43e97900c2778a50856dac605"}, + {file = "coverage-7.6.0-cp312-cp312-win32.whl", hash = "sha256:3c59105f8d58ce500f348c5b56163a4113a440dad6daa2294b5052a10db866da"}, + {file = 
"coverage-7.6.0-cp312-cp312-win_amd64.whl", hash = "sha256:ca5d79cfdae420a1d52bf177de4bc2289c321d6c961ae321503b2ca59c17ae67"}, + {file = "coverage-7.6.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d39bd10f0ae453554798b125d2f39884290c480f56e8a02ba7a6ed552005243b"}, + {file = "coverage-7.6.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:beb08e8508e53a568811016e59f3234d29c2583f6b6e28572f0954a6b4f7e03d"}, + {file = "coverage-7.6.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b2e16f4cd2bc4d88ba30ca2d3bbf2f21f00f382cf4e1ce3b1ddc96c634bc48ca"}, + {file = "coverage-7.6.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6616d1c9bf1e3faea78711ee42a8b972367d82ceae233ec0ac61cc7fec09fa6b"}, + {file = "coverage-7.6.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad4567d6c334c46046d1c4c20024de2a1c3abc626817ae21ae3da600f5779b44"}, + {file = "coverage-7.6.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:d17c6a415d68cfe1091d3296ba5749d3d8696e42c37fca5d4860c5bf7b729f03"}, + {file = "coverage-7.6.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:9146579352d7b5f6412735d0f203bbd8d00113a680b66565e205bc605ef81bc6"}, + {file = "coverage-7.6.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:cdab02a0a941af190df8782aafc591ef3ad08824f97850b015c8c6a8b3877b0b"}, + {file = "coverage-7.6.0-cp38-cp38-win32.whl", hash = "sha256:df423f351b162a702c053d5dddc0fc0ef9a9e27ea3f449781ace5f906b664428"}, + {file = "coverage-7.6.0-cp38-cp38-win_amd64.whl", hash = "sha256:f2501d60d7497fd55e391f423f965bbe9e650e9ffc3c627d5f0ac516026000b8"}, + {file = "coverage-7.6.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7221f9ac9dad9492cecab6f676b3eaf9185141539d5c9689d13fd6b0d7de840c"}, + {file = "coverage-7.6.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ddaaa91bfc4477d2871442bbf30a125e8fe6b05da8a0015507bfbf4718228ab2"}, + {file = "coverage-7.6.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c4cbe651f3904e28f3a55d6f371203049034b4ddbce65a54527a3f189ca3b390"}, + {file = "coverage-7.6.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:831b476d79408ab6ccfadaaf199906c833f02fdb32c9ab907b1d4aa0713cfa3b"}, + {file = "coverage-7.6.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:46c3d091059ad0b9c59d1034de74a7f36dcfa7f6d3bde782c49deb42438f2450"}, + {file = "coverage-7.6.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:4d5fae0a22dc86259dee66f2cc6c1d3e490c4a1214d7daa2a93d07491c5c04b6"}, + {file = "coverage-7.6.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:07ed352205574aad067482e53dd606926afebcb5590653121063fbf4e2175166"}, + {file = "coverage-7.6.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:49c76cdfa13015c4560702574bad67f0e15ca5a2872c6a125f6327ead2b731dd"}, + {file = "coverage-7.6.0-cp39-cp39-win32.whl", hash = "sha256:482855914928c8175735a2a59c8dc5806cf7d8f032e4820d52e845d1f731dca2"}, + {file = "coverage-7.6.0-cp39-cp39-win_amd64.whl", hash = "sha256:543ef9179bc55edfd895154a51792b01c017c87af0ebaae092720152e19e42ca"}, + {file = "coverage-7.6.0-pp38.pp39.pp310-none-any.whl", hash = "sha256:6fe885135c8a479d3e37a7aae61cbd3a0fb2deccb4dda3c25f92a49189f766d6"}, + {file = "coverage-7.6.0.tar.gz", hash = "sha256:289cc803fa1dc901f84701ac10c9ee873619320f2f9aff38794db4a4a0268d51"}, ] [package.dependencies] @@ -963,21 
+963,22 @@ files = [ [[package]] name = "deptry" -version = "0.16.1" +version = "0.16.2" description = "A command line utility to check for unused, missing and transitive dependencies in a Python project." optional = false python-versions = ">=3.8" files = [ - {file = "deptry-0.16.1-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:29ed8ae61b8f5664dd484717c79eef7ec66d965940efd828fca0d3c09220a1db"}, - {file = "deptry-0.16.1-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:738a772b538f51e9a7bb8d5cb9a61cfea8794a79371d171919b01cff0dc895bf"}, - {file = "deptry-0.16.1-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:56b78f7c860def8000e93f88345a24809f1b91e2f7836ac9a08285cb405e2762"}, - {file = "deptry-0.16.1-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c3e86a04ea87ddece0f68ba204feb950f588205808c8320e6628300f03ff66dc"}, - {file = "deptry-0.16.1-cp38-abi3-win_amd64.whl", hash = "sha256:01b5098739a56c93f3e1e40efec5f20452f22a9a8436a59809d46201fcb94bcf"}, - {file = "deptry-0.16.1-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:7e29dc4c1bbb933c9482e8cef85fafe2be7f46aeb90a8a07ba5f2b22af60876f"}, - {file = "deptry-0.16.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8dfab68c247566c87a40f55f405be8549ffe4cea0b9b5384b7ae73a6f1d5cd1"}, - {file = "deptry-0.16.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1228493926b6e59cd2df7cb6016e10c255553cc31db24edcf7fc8d5474b81be6"}, - {file = "deptry-0.16.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:99c3ac60b78ad1b8fb9844c25393e7ebc969cc950601ce3c050f56d196da5a79"}, - {file = "deptry-0.16.1.tar.gz", hash = "sha256:39fb62da4a8f4d17ed282310f7bcaadec55a95a8c471b01e0fcdf5351a7ac323"}, + {file = "deptry-0.16.2-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:24bfbae07bd6533c852c795e8d88d05a8ad0801bec0d3662e1a37db763c52540"}, + {file = "deptry-0.16.2-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:fc881688a2eaeafe51c0617d32a6535057bccdb74559cc667109f48f81cd976e"}, + {file = "deptry-0.16.2-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fed4b692f556e4c80acb42cec93e3b5fdc7fc2323049c2a0cfd9dfc4a9c7033e"}, + {file = "deptry-0.16.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:93ec508a932d8f06c3bd1aa7a4548d5dbec92c3060d42eedcda3be9729bd7c3b"}, + {file = "deptry-0.16.2-cp38-abi3-win_amd64.whl", hash = "sha256:eb92e9aacde66cfe001d6318eb0851ae0ca26fea441defed4765a47644daf8bb"}, + {file = "deptry-0.16.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:dfdceca2fbc87f4bce04df4207914a5eb37e67fb2107579ad2e88107c22d2456"}, + {file = "deptry-0.16.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:96ab62dd5f4658735aac72d0e49f6d896eabf50a0e4e2cdecb436a1362aa696b"}, + {file = "deptry-0.16.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7e4408fa5a8d146b55bc40f0829fb875efef33174a2679bd9954ce988b9bc0d7"}, + {file = "deptry-0.16.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:af976afc2a0583f48dc25f616d2566fecd7af5080675c8eccb161def88d93503"}, + {file = "deptry-0.16.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:dd86c9d34aa75b91fb72b34110f0660b2277bf9a95fe9cae3ead36d465bc44ac"}, + {file = "deptry-0.16.2.tar.gz", hash = "sha256:f0f752cf6f5e9f7445a79fcf195b772cd2d4b889cd260e23867dd8013caa74c1"}, ] [package.dependencies] @@ -1099,42 +1100,42 @@ test = ["pytest (>=6)"] [[package]] name = 
"fastavro" -version = "1.9.4" +version = "1.9.5" description = "Fast read/write of AVRO files" optional = false python-versions = ">=3.8" files = [ - {file = "fastavro-1.9.4-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:60cb38f07462a7fb4e4440ed0de67d3d400ae6b3d780f81327bebde9aa55faef"}, - {file = "fastavro-1.9.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:063d01d197fc929c20adc09ca9f0ca86d33ac25ee0963ce0b438244eee8315ae"}, - {file = "fastavro-1.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87a9053fcfbc895f2a16a4303af22077e3a8fdcf1cd5d6ed47ff2ef22cbba2f0"}, - {file = "fastavro-1.9.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:02bf1276b7326397314adf41b34a4890f6ffa59cf7e0eb20b9e4ab0a143a1598"}, - {file = "fastavro-1.9.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:56bed9eca435389a8861e6e2d631ec7f8f5dda5b23f93517ac710665bd34ca29"}, - {file = "fastavro-1.9.4-cp310-cp310-win_amd64.whl", hash = "sha256:0cd2099c8c672b853e0b20c13e9b62a69d3fbf67ee7c59c7271ba5df1680310d"}, - {file = "fastavro-1.9.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:af8c6d8c43a02b5569c093fc5467469541ac408c79c36a5b0900d3dd0b3ba838"}, - {file = "fastavro-1.9.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e4a138710bd61580324d23bc5e3df01f0b82aee0a76404d5dddae73d9e4c723f"}, - {file = "fastavro-1.9.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:903d97418120ca6b6a7f38a731166c1ccc2c4344ee5e0470d09eb1dc3687540a"}, - {file = "fastavro-1.9.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:c443eeb99899d062dbf78c525e4614dd77e041a7688fa2710c224f4033f193ae"}, - {file = "fastavro-1.9.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:ac26ab0774d1b2b7af6d8f4300ad20bbc4b5469e658a02931ad13ce23635152f"}, - {file = "fastavro-1.9.4-cp311-cp311-win_amd64.whl", hash = "sha256:cf7247874c22be856ba7d1f46a0f6e0379a6025f1a48a7da640444cbac6f570b"}, - {file = "fastavro-1.9.4-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:68912f2020e1b3d70557260b27dd85fb49a4fc6bfab18d384926127452c1da4c"}, - {file = "fastavro-1.9.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6925ce137cdd78e109abdb0bc33aad55de6c9f2d2d3036b65453128f2f5f5b92"}, - {file = "fastavro-1.9.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8b928cd294e36e35516d0deb9e104b45be922ba06940794260a4e5dbed6c192a"}, - {file = "fastavro-1.9.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:90c9838bc4c991ffff5dd9d88a0cc0030f938b3fdf038cdf6babde144b920246"}, - {file = "fastavro-1.9.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:eca6e54da571b06a3c5a72dbb7212073f56c92a6fbfbf847b91c347510f8a426"}, - {file = "fastavro-1.9.4-cp312-cp312-win_amd64.whl", hash = "sha256:a4b02839ac261100cefca2e2ad04cdfedc556cb66b5ec735e0db428e74b399de"}, - {file = "fastavro-1.9.4-cp38-cp38-macosx_11_0_x86_64.whl", hash = "sha256:4451ee9a305a73313a1558d471299f3130e4ecc10a88bf5742aa03fb37e042e6"}, - {file = "fastavro-1.9.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a8524fccfb379565568c045d29b2ebf71e1f2c0dd484aeda9fe784ef5febe1a8"}, - {file = "fastavro-1.9.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:33d0a00a6e09baa20f6f038d7a2ddcb7eef0e7a9980e947a018300cb047091b8"}, - {file = "fastavro-1.9.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:23d7e5b29c9bf6f26e8be754b2c8b919838e506f78ef724de7d22881696712fc"}, - 
{file = "fastavro-1.9.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:2e6ab3ee53944326460edf1125b2ad5be2fadd80f7211b13c45fa0c503b4cf8d"}, - {file = "fastavro-1.9.4-cp38-cp38-win_amd64.whl", hash = "sha256:64d335ec2004204c501f8697c385d0a8f6b521ac82d5b30696f789ff5bc85f3c"}, - {file = "fastavro-1.9.4-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:7e05f44c493e89e73833bd3ff3790538726906d2856f59adc8103539f4a1b232"}, - {file = "fastavro-1.9.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:253c63993250bff4ee7b11fb46cf3a4622180a783bedc82a24c6fdcd1b10ca2a"}, - {file = "fastavro-1.9.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:24d6942eb1db14640c2581e0ecd1bbe0afc8a83731fcd3064ae7f429d7880cb7"}, - {file = "fastavro-1.9.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:d47bb66be6091cd48cfe026adcad11c8b11d7d815a2949a1e4ccf03df981ca65"}, - {file = "fastavro-1.9.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:c293897f12f910e58a1024f9c77f565aa8e23b36aafda6ad8e7041accc57a57f"}, - {file = "fastavro-1.9.4-cp39-cp39-win_amd64.whl", hash = "sha256:f05d2afcb10a92e2a9e580a3891f090589b3e567fdc5641f8a46a0b084f120c3"}, - {file = "fastavro-1.9.4.tar.gz", hash = "sha256:56b8363e360a1256c94562393dc7f8611f3baf2b3159f64fb2b9c6b87b14e876"}, + {file = "fastavro-1.9.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:61253148e95dd2b6457247b441b7555074a55de17aef85f5165bfd5facf600fc"}, + {file = "fastavro-1.9.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b604935d671ad47d888efc92a106f98e9440874108b444ac10e28d643109c937"}, + {file = "fastavro-1.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0adbf4956fd53bd74c41e7855bb45ccce953e0eb0e44f5836d8d54ad843f9944"}, + {file = "fastavro-1.9.5-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:53d838e31457db8bf44460c244543f75ed307935d5fc1d93bc631cc7caef2082"}, + {file = "fastavro-1.9.5-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:07b6288e8681eede16ff077632c47395d4925c2f51545cd7a60f194454db2211"}, + {file = "fastavro-1.9.5-cp310-cp310-win_amd64.whl", hash = "sha256:ef08cf247fdfd61286ac0c41854f7194f2ad05088066a756423d7299b688d975"}, + {file = "fastavro-1.9.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c52d7bb69f617c90935a3e56feb2c34d4276819a5c477c466c6c08c224a10409"}, + {file = "fastavro-1.9.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:85e05969956003df8fa4491614bc62fe40cec59e94d06e8aaa8d8256ee3aab82"}, + {file = "fastavro-1.9.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:06e6df8527493a9f0d9a8778df82bab8b1aa6d80d1b004e5aec0a31dc4dc501c"}, + {file = "fastavro-1.9.5-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:27820da3b17bc01cebb6d1687c9d7254b16d149ef458871aaa207ed8950f3ae6"}, + {file = "fastavro-1.9.5-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:195a5b8e33eb89a1a9b63fa9dce7a77d41b3b0cd785bac6044df619f120361a2"}, + {file = "fastavro-1.9.5-cp311-cp311-win_amd64.whl", hash = "sha256:be612c109efb727bfd36d4d7ed28eb8e0506617b7dbe746463ebbf81e85eaa6b"}, + {file = "fastavro-1.9.5-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:b133456c8975ec7d2a99e16a7e68e896e45c821b852675eac4ee25364b999c14"}, + {file = "fastavro-1.9.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf586373c3d1748cac849395aad70c198ee39295f92e7c22c75757b5c0300fbe"}, + {file = 
"fastavro-1.9.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:724ef192bc9c55d5b4c7df007f56a46a21809463499856349d4580a55e2b914c"}, + {file = "fastavro-1.9.5-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:bfd11fe355a8f9c0416803afac298960eb4c603a23b1c74ff9c1d3e673ea7185"}, + {file = "fastavro-1.9.5-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:9827d1654d7bcb118ef5efd3e5b2c9ab2a48d44dac5e8c6a2327bc3ac3caa828"}, + {file = "fastavro-1.9.5-cp312-cp312-win_amd64.whl", hash = "sha256:d84b69dca296667e6137ae7c9a96d060123adbc0c00532cc47012b64d38b47e9"}, + {file = "fastavro-1.9.5-cp38-cp38-macosx_11_0_universal2.whl", hash = "sha256:fb744e9de40fb1dc75354098c8db7da7636cba50a40f7bef3b3fb20f8d189d88"}, + {file = "fastavro-1.9.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:240df8bacd13ff5487f2465604c007d686a566df5cbc01d0550684eaf8ff014a"}, + {file = "fastavro-1.9.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c3bb35c25bbc3904e1c02333bc1ae0173e0a44aa37a8e95d07e681601246e1f1"}, + {file = "fastavro-1.9.5-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:b47a54a9700de3eabefd36dabfb237808acae47bc873cada6be6990ef6b165aa"}, + {file = "fastavro-1.9.5-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:48c7b5e6d2f3bf7917af301c275b05c5be3dd40bb04e80979c9e7a2ab31a00d1"}, + {file = "fastavro-1.9.5-cp38-cp38-win_amd64.whl", hash = "sha256:05d13f98d4e325be40387e27da9bd60239968862fe12769258225c62ec906f04"}, + {file = "fastavro-1.9.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5b47948eb196263f6111bf34e1cd08d55529d4ed46eb50c1bc8c7c30a8d18868"}, + {file = "fastavro-1.9.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:85b7a66ad521298ad9373dfe1897a6ccfc38feab54a47b97922e213ae5ad8870"}, + {file = "fastavro-1.9.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:44cb154f863ad80e41aea72a709b12e1533b8728c89b9b1348af91a6154ab2f5"}, + {file = "fastavro-1.9.5-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:b5f7f2b1fe21231fd01f1a2a90e714ae267fe633cd7ce930c0aea33d1c9f4901"}, + {file = "fastavro-1.9.5-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:88fbbe16c61d90a89d78baeb5a34dc1c63a27b115adccdbd6b1fb6f787deacf2"}, + {file = "fastavro-1.9.5-cp39-cp39-win_amd64.whl", hash = "sha256:753f5eedeb5ca86004e23a9ce9b41c5f25eb64a876f95edcc33558090a7f3e4b"}, + {file = "fastavro-1.9.5.tar.gz", hash = "sha256:6419ebf45f88132a9945c51fe555d4f10bb97c236288ed01894f957c6f914553"}, ] [package.extras] @@ -2212,13 +2213,13 @@ test = ["mypy (>=1.0)", "pytest (>=7.0.0)"] [[package]] name = "moto" -version = "5.0.10" +version = "5.0.11" description = "" optional = false python-versions = ">=3.8" files = [ - {file = "moto-5.0.10-py2.py3-none-any.whl", hash = "sha256:9ffae2f64cc8fe95b9a12d63ae7268a7d6bea9993b922905b5abd8197d852cd0"}, - {file = "moto-5.0.10.tar.gz", hash = "sha256:eff37363221c93ea44f95721ae0ddb56f977fe70437a041b6cc641ee90266279"}, + {file = "moto-5.0.11-py2.py3-none-any.whl", hash = "sha256:bdba9bec0afcde9f99b58c5271d6458dbfcda0a0a1e9beaecd808d2591db65ea"}, + {file = "moto-5.0.11.tar.gz", hash = "sha256:606b641f4c6ef69f28a84147d6d6806d052011e7ae7b0fe46ae8858e7a27a0a3"}, ] [package.dependencies] @@ -2490,13 +2491,13 @@ files = [ [[package]] name = "mypy-boto3-glue" -version = "1.34.136" -description = "Type annotations for boto3.Glue 1.34.136 service generated with mypy-boto3-builder 7.24.0" +version = "1.34.143" +description = "Type annotations for boto3.Glue 1.34.143 
service generated with mypy-boto3-builder 7.25.0" optional = true python-versions = ">=3.8" files = [ - {file = "mypy_boto3_glue-1.34.136-py3-none-any.whl", hash = "sha256:ec86ac01b8d821718f07597bd3dea975f21ea7178b283783077f623597b1b081"}, - {file = "mypy_boto3_glue-1.34.136.tar.gz", hash = "sha256:93cb4d37e2dc79b3325d93bed6e5af739ea8b6bb093974bf37ff768f186821f8"}, + {file = "mypy_boto3_glue-1.34.143-py3-none-any.whl", hash = "sha256:50b620ac58a4b2c0d66966f1e72e2aa53ec703a0e56751a93f0d48710e3a7d6d"}, + {file = "mypy_boto3_glue-1.34.143.tar.gz", hash = "sha256:7f98835c255a50cea9ef892b03582ecd19754e6bba5827356c90453af1e3b0f5"}, ] [package.dependencies] @@ -2670,8 +2671,8 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.23.2", markers = "python_version >= \"3.11\""}, {version = ">=1.20.3", markers = "python_version < \"3.10\""}, + {version = ">=1.23.2", markers = "python_version >= \"3.11\""}, {version = ">=1.21.0", markers = "python_version >= \"3.10\" and python_version < \"3.11\""}, ] python-dateutil = ">=2.8.2" @@ -3037,8 +3038,8 @@ files = [ annotated-types = ">=0.4.0" pydantic-core = "2.20.1" typing-extensions = [ - {version = ">=4.12.2", markers = "python_version >= \"3.13\""}, {version = ">=4.6.1", markers = "python_version < \"3.13\""}, + {version = ">=4.12.2", markers = "python_version >= \"3.13\""}, ] [package.extras] @@ -4032,13 +4033,13 @@ mpmath = ">=0.19" [[package]] name = "tenacity" -version = "8.4.2" +version = "8.5.0" description = "Retry code until it succeeds" optional = false python-versions = ">=3.8" files = [ - {file = "tenacity-8.4.2-py3-none-any.whl", hash = "sha256:9e6f7cf7da729125c7437222f8a522279751cdfbe6b67bfe64f75d3a348661b2"}, - {file = "tenacity-8.4.2.tar.gz", hash = "sha256:cd80a53a79336edba8489e767f729e4f391c896956b57140b5d7511a64bbd3ef"}, + {file = "tenacity-8.5.0-py3-none-any.whl", hash = "sha256:b594c2a5945830c267ce6b79a166228323ed52718f30302c1359836112346687"}, + {file = "tenacity-8.5.0.tar.gz", hash = "sha256:8bc6c0c8a09b31e6cad13c47afbed1a567518250a9a171418582ed8d9c20ca78"}, ] [package.extras] @@ -4364,72 +4365,123 @@ multidict = ">=4.0" [[package]] name = "zipp" -version = "3.17.0" +version = "3.19.1" description = "Backport of pathlib-compatible object wrapper for zip files" optional = false python-versions = ">=3.8" files = [ - {file = "zipp-3.17.0-py3-none-any.whl", hash = "sha256:0e923e726174922dce09c53c59ad483ff7bbb8e572e00c7f7c46b88556409f31"}, - {file = "zipp-3.17.0.tar.gz", hash = "sha256:84e64a1c28cf7e91ed2078bb8cc8c259cb19b76942096c8d7b84947690cabaf0"}, + {file = "zipp-3.19.1-py3-none-any.whl", hash = "sha256:2828e64edb5386ea6a52e7ba7cdb17bb30a73a858f5eb6eb93d8d36f5ea26091"}, + {file = "zipp-3.19.1.tar.gz", hash = "sha256:35427f6d5594f4acf82d25541438348c26736fa9b3afa2754bcd63cdb99d8e8f"}, ] [package.extras] -docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-lint"] -testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy (>=0.9.1)", "pytest-ruff"] +doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +test = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", 
"pytest-mypy", "pytest-ruff (>=0.2.1)"] [[package]] name = "zstandard" -version = "0.22.0" +version = "0.23.0" description = "Zstandard bindings for Python" optional = false python-versions = ">=3.8" files = [ - {file = "zstandard-0.22.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:275df437ab03f8c033b8a2c181e51716c32d831082d93ce48002a5227ec93019"}, - {file = "zstandard-0.22.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2ac9957bc6d2403c4772c890916bf181b2653640da98f32e04b96e4d6fb3252a"}, - {file = "zstandard-0.22.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fe3390c538f12437b859d815040763abc728955a52ca6ff9c5d4ac707c4ad98e"}, - {file = "zstandard-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1958100b8a1cc3f27fa21071a55cb2ed32e9e5df4c3c6e661c193437f171cba2"}, - {file = "zstandard-0.22.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:93e1856c8313bc688d5df069e106a4bc962eef3d13372020cc6e3ebf5e045202"}, - {file = "zstandard-0.22.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:1a90ba9a4c9c884bb876a14be2b1d216609385efb180393df40e5172e7ecf356"}, - {file = "zstandard-0.22.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:3db41c5e49ef73641d5111554e1d1d3af106410a6c1fb52cf68912ba7a343a0d"}, - {file = "zstandard-0.22.0-cp310-cp310-win32.whl", hash = "sha256:d8593f8464fb64d58e8cb0b905b272d40184eac9a18d83cf8c10749c3eafcd7e"}, - {file = "zstandard-0.22.0-cp310-cp310-win_amd64.whl", hash = "sha256:f1a4b358947a65b94e2501ce3e078bbc929b039ede4679ddb0460829b12f7375"}, - {file = "zstandard-0.22.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:589402548251056878d2e7c8859286eb91bd841af117dbe4ab000e6450987e08"}, - {file = "zstandard-0.22.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a97079b955b00b732c6f280d5023e0eefe359045e8b83b08cf0333af9ec78f26"}, - {file = "zstandard-0.22.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:445b47bc32de69d990ad0f34da0e20f535914623d1e506e74d6bc5c9dc40bb09"}, - {file = "zstandard-0.22.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:33591d59f4956c9812f8063eff2e2c0065bc02050837f152574069f5f9f17775"}, - {file = "zstandard-0.22.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:888196c9c8893a1e8ff5e89b8f894e7f4f0e64a5af4d8f3c410f0319128bb2f8"}, - {file = "zstandard-0.22.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:53866a9d8ab363271c9e80c7c2e9441814961d47f88c9bc3b248142c32141d94"}, - {file = "zstandard-0.22.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:4ac59d5d6910b220141c1737b79d4a5aa9e57466e7469a012ed42ce2d3995e88"}, - {file = "zstandard-0.22.0-cp311-cp311-win32.whl", hash = "sha256:2b11ea433db22e720758cba584c9d661077121fcf60ab43351950ded20283440"}, - {file = "zstandard-0.22.0-cp311-cp311-win_amd64.whl", hash = "sha256:11f0d1aab9516a497137b41e3d3ed4bbf7b2ee2abc79e5c8b010ad286d7464bd"}, - {file = "zstandard-0.22.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:6c25b8eb733d4e741246151d895dd0308137532737f337411160ff69ca24f93a"}, - {file = "zstandard-0.22.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f9b2cde1cd1b2a10246dbc143ba49d942d14fb3d2b4bccf4618d475c65464912"}, - {file = "zstandard-0.22.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a88b7df61a292603e7cd662d92565d915796b094ffb3d206579aaebac6b85d5f"}, - {file = 
"zstandard-0.22.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:466e6ad8caefb589ed281c076deb6f0cd330e8bc13c5035854ffb9c2014b118c"}, - {file = "zstandard-0.22.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a1d67d0d53d2a138f9e29d8acdabe11310c185e36f0a848efa104d4e40b808e4"}, - {file = "zstandard-0.22.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:39b2853efc9403927f9065cc48c9980649462acbdf81cd4f0cb773af2fd734bc"}, - {file = "zstandard-0.22.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8a1b2effa96a5f019e72874969394edd393e2fbd6414a8208fea363a22803b45"}, - {file = "zstandard-0.22.0-cp312-cp312-win32.whl", hash = "sha256:88c5b4b47a8a138338a07fc94e2ba3b1535f69247670abfe422de4e0b344aae2"}, - {file = "zstandard-0.22.0-cp312-cp312-win_amd64.whl", hash = "sha256:de20a212ef3d00d609d0b22eb7cc798d5a69035e81839f549b538eff4105d01c"}, - {file = "zstandard-0.22.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d75f693bb4e92c335e0645e8845e553cd09dc91616412d1d4650da835b5449df"}, - {file = "zstandard-0.22.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:36a47636c3de227cd765e25a21dc5dace00539b82ddd99ee36abae38178eff9e"}, - {file = "zstandard-0.22.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:68953dc84b244b053c0d5f137a21ae8287ecf51b20872eccf8eaac0302d3e3b0"}, - {file = "zstandard-0.22.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2612e9bb4977381184bb2463150336d0f7e014d6bb5d4a370f9a372d21916f69"}, - {file = "zstandard-0.22.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:23d2b3c2b8e7e5a6cb7922f7c27d73a9a615f0a5ab5d0e03dd533c477de23004"}, - {file = "zstandard-0.22.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:1d43501f5f31e22baf822720d82b5547f8a08f5386a883b32584a185675c8fbf"}, - {file = "zstandard-0.22.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:a493d470183ee620a3df1e6e55b3e4de8143c0ba1b16f3ded83208ea8ddfd91d"}, - {file = "zstandard-0.22.0-cp38-cp38-win32.whl", hash = "sha256:7034d381789f45576ec3f1fa0e15d741828146439228dc3f7c59856c5bcd3292"}, - {file = "zstandard-0.22.0-cp38-cp38-win_amd64.whl", hash = "sha256:d8fff0f0c1d8bc5d866762ae95bd99d53282337af1be9dc0d88506b340e74b73"}, - {file = "zstandard-0.22.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2fdd53b806786bd6112d97c1f1e7841e5e4daa06810ab4b284026a1a0e484c0b"}, - {file = "zstandard-0.22.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:73a1d6bd01961e9fd447162e137ed949c01bdb830dfca487c4a14e9742dccc93"}, - {file = "zstandard-0.22.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9501f36fac6b875c124243a379267d879262480bf85b1dbda61f5ad4d01b75a3"}, - {file = "zstandard-0.22.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:48f260e4c7294ef275744210a4010f116048e0c95857befb7462e033f09442fe"}, - {file = "zstandard-0.22.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:959665072bd60f45c5b6b5d711f15bdefc9849dd5da9fb6c873e35f5d34d8cfb"}, - {file = "zstandard-0.22.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:d22fdef58976457c65e2796e6730a3ea4a254f3ba83777ecfc8592ff8d77d303"}, - {file = "zstandard-0.22.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a7ccf5825fd71d4542c8ab28d4d482aace885f5ebe4b40faaa290eed8e095a4c"}, - {file = "zstandard-0.22.0-cp39-cp39-win32.whl", hash = 
"sha256:f058a77ef0ece4e210bb0450e68408d4223f728b109764676e1a13537d056bb0"}, - {file = "zstandard-0.22.0-cp39-cp39-win_amd64.whl", hash = "sha256:e9e9d4e2e336c529d4c435baad846a181e39a982f823f7e4495ec0b0ec8538d2"}, - {file = "zstandard-0.22.0.tar.gz", hash = "sha256:8226a33c542bcb54cd6bd0a366067b610b41713b64c9abec1bc4533d69f51e70"}, + {file = "zstandard-0.23.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bf0a05b6059c0528477fba9054d09179beb63744355cab9f38059548fedd46a9"}, + {file = "zstandard-0.23.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fc9ca1c9718cb3b06634c7c8dec57d24e9438b2aa9a0f02b8bb36bf478538880"}, + {file = "zstandard-0.23.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77da4c6bfa20dd5ea25cbf12c76f181a8e8cd7ea231c673828d0386b1740b8dc"}, + {file = "zstandard-0.23.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b2170c7e0367dde86a2647ed5b6f57394ea7f53545746104c6b09fc1f4223573"}, + {file = "zstandard-0.23.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c16842b846a8d2a145223f520b7e18b57c8f476924bda92aeee3a88d11cfc391"}, + {file = "zstandard-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:157e89ceb4054029a289fb504c98c6a9fe8010f1680de0201b3eb5dc20aa6d9e"}, + {file = "zstandard-0.23.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:203d236f4c94cd8379d1ea61db2fce20730b4c38d7f1c34506a31b34edc87bdd"}, + {file = "zstandard-0.23.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:dc5d1a49d3f8262be192589a4b72f0d03b72dcf46c51ad5852a4fdc67be7b9e4"}, + {file = "zstandard-0.23.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:752bf8a74412b9892f4e5b58f2f890a039f57037f52c89a740757ebd807f33ea"}, + {file = "zstandard-0.23.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:80080816b4f52a9d886e67f1f96912891074903238fe54f2de8b786f86baded2"}, + {file = "zstandard-0.23.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:84433dddea68571a6d6bd4fbf8ff398236031149116a7fff6f777ff95cad3df9"}, + {file = "zstandard-0.23.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:ab19a2d91963ed9e42b4e8d77cd847ae8381576585bad79dbd0a8837a9f6620a"}, + {file = "zstandard-0.23.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:59556bf80a7094d0cfb9f5e50bb2db27fefb75d5138bb16fb052b61b0e0eeeb0"}, + {file = "zstandard-0.23.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:27d3ef2252d2e62476389ca8f9b0cf2bbafb082a3b6bfe9d90cbcbb5529ecf7c"}, + {file = "zstandard-0.23.0-cp310-cp310-win32.whl", hash = "sha256:5d41d5e025f1e0bccae4928981e71b2334c60f580bdc8345f824e7c0a4c2a813"}, + {file = "zstandard-0.23.0-cp310-cp310-win_amd64.whl", hash = "sha256:519fbf169dfac1222a76ba8861ef4ac7f0530c35dd79ba5727014613f91613d4"}, + {file = "zstandard-0.23.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:34895a41273ad33347b2fc70e1bff4240556de3c46c6ea430a7ed91f9042aa4e"}, + {file = "zstandard-0.23.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:77ea385f7dd5b5676d7fd943292ffa18fbf5c72ba98f7d09fc1fb9e819b34c23"}, + {file = "zstandard-0.23.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:983b6efd649723474f29ed42e1467f90a35a74793437d0bc64a5bf482bedfa0a"}, + {file = "zstandard-0.23.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:80a539906390591dd39ebb8d773771dc4db82ace6372c4d41e2d293f8e32b8db"}, + {file = 
"zstandard-0.23.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:445e4cb5048b04e90ce96a79b4b63140e3f4ab5f662321975679b5f6360b90e2"}, + {file = "zstandard-0.23.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd30d9c67d13d891f2360b2a120186729c111238ac63b43dbd37a5a40670b8ca"}, + {file = "zstandard-0.23.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d20fd853fbb5807c8e84c136c278827b6167ded66c72ec6f9a14b863d809211c"}, + {file = "zstandard-0.23.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ed1708dbf4d2e3a1c5c69110ba2b4eb6678262028afd6c6fbcc5a8dac9cda68e"}, + {file = "zstandard-0.23.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:be9b5b8659dff1f913039c2feee1aca499cfbc19e98fa12bc85e037c17ec6ca5"}, + {file = "zstandard-0.23.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:65308f4b4890aa12d9b6ad9f2844b7ee42c7f7a4fd3390425b242ffc57498f48"}, + {file = "zstandard-0.23.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:98da17ce9cbf3bfe4617e836d561e433f871129e3a7ac16d6ef4c680f13a839c"}, + {file = "zstandard-0.23.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:8ed7d27cb56b3e058d3cf684d7200703bcae623e1dcc06ed1e18ecda39fee003"}, + {file = "zstandard-0.23.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:b69bb4f51daf461b15e7b3db033160937d3ff88303a7bc808c67bbc1eaf98c78"}, + {file = "zstandard-0.23.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:034b88913ecc1b097f528e42b539453fa82c3557e414b3de9d5632c80439a473"}, + {file = "zstandard-0.23.0-cp311-cp311-win32.whl", hash = "sha256:f2d4380bf5f62daabd7b751ea2339c1a21d1c9463f1feb7fc2bdcea2c29c3160"}, + {file = "zstandard-0.23.0-cp311-cp311-win_amd64.whl", hash = "sha256:62136da96a973bd2557f06ddd4e8e807f9e13cbb0bfb9cc06cfe6d98ea90dfe0"}, + {file = "zstandard-0.23.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b4567955a6bc1b20e9c31612e615af6b53733491aeaa19a6b3b37f3b65477094"}, + {file = "zstandard-0.23.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1e172f57cd78c20f13a3415cc8dfe24bf388614324d25539146594c16d78fcc8"}, + {file = "zstandard-0.23.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b0e166f698c5a3e914947388c162be2583e0c638a4703fc6a543e23a88dea3c1"}, + {file = "zstandard-0.23.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:12a289832e520c6bd4dcaad68e944b86da3bad0d339ef7989fb7e88f92e96072"}, + {file = "zstandard-0.23.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d50d31bfedd53a928fed6707b15a8dbeef011bb6366297cc435accc888b27c20"}, + {file = "zstandard-0.23.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:72c68dda124a1a138340fb62fa21b9bf4848437d9ca60bd35db36f2d3345f373"}, + {file = "zstandard-0.23.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:53dd9d5e3d29f95acd5de6802e909ada8d8d8cfa37a3ac64836f3bc4bc5512db"}, + {file = "zstandard-0.23.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:6a41c120c3dbc0d81a8e8adc73312d668cd34acd7725f036992b1b72d22c1772"}, + {file = "zstandard-0.23.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:40b33d93c6eddf02d2c19f5773196068d875c41ca25730e8288e9b672897c105"}, + {file = "zstandard-0.23.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:9206649ec587e6b02bd124fb7799b86cddec350f6f6c14bc82a2b70183e708ba"}, + {file = 
"zstandard-0.23.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:76e79bc28a65f467e0409098fa2c4376931fd3207fbeb6b956c7c476d53746dd"}, + {file = "zstandard-0.23.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:66b689c107857eceabf2cf3d3fc699c3c0fe8ccd18df2219d978c0283e4c508a"}, + {file = "zstandard-0.23.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:9c236e635582742fee16603042553d276cca506e824fa2e6489db04039521e90"}, + {file = "zstandard-0.23.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:a8fffdbd9d1408006baaf02f1068d7dd1f016c6bcb7538682622c556e7b68e35"}, + {file = "zstandard-0.23.0-cp312-cp312-win32.whl", hash = "sha256:dc1d33abb8a0d754ea4763bad944fd965d3d95b5baef6b121c0c9013eaf1907d"}, + {file = "zstandard-0.23.0-cp312-cp312-win_amd64.whl", hash = "sha256:64585e1dba664dc67c7cdabd56c1e5685233fbb1fc1966cfba2a340ec0dfff7b"}, + {file = "zstandard-0.23.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:576856e8594e6649aee06ddbfc738fec6a834f7c85bf7cadd1c53d4a58186ef9"}, + {file = "zstandard-0.23.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:38302b78a850ff82656beaddeb0bb989a0322a8bbb1bf1ab10c17506681d772a"}, + {file = "zstandard-0.23.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d2240ddc86b74966c34554c49d00eaafa8200a18d3a5b6ffbf7da63b11d74ee2"}, + {file = "zstandard-0.23.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2ef230a8fd217a2015bc91b74f6b3b7d6522ba48be29ad4ea0ca3a3775bf7dd5"}, + {file = "zstandard-0.23.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:774d45b1fac1461f48698a9d4b5fa19a69d47ece02fa469825b442263f04021f"}, + {file = "zstandard-0.23.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f77fa49079891a4aab203d0b1744acc85577ed16d767b52fc089d83faf8d8ed"}, + {file = "zstandard-0.23.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ac184f87ff521f4840e6ea0b10c0ec90c6b1dcd0bad2f1e4a9a1b4fa177982ea"}, + {file = "zstandard-0.23.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:c363b53e257246a954ebc7c488304b5592b9c53fbe74d03bc1c64dda153fb847"}, + {file = "zstandard-0.23.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:e7792606d606c8df5277c32ccb58f29b9b8603bf83b48639b7aedf6df4fe8171"}, + {file = "zstandard-0.23.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a0817825b900fcd43ac5d05b8b3079937073d2b1ff9cf89427590718b70dd840"}, + {file = "zstandard-0.23.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:9da6bc32faac9a293ddfdcb9108d4b20416219461e4ec64dfea8383cac186690"}, + {file = "zstandard-0.23.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:fd7699e8fd9969f455ef2926221e0233f81a2542921471382e77a9e2f2b57f4b"}, + {file = "zstandard-0.23.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:d477ed829077cd945b01fc3115edd132c47e6540ddcd96ca169facff28173057"}, + {file = "zstandard-0.23.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:fa6ce8b52c5987b3e34d5674b0ab529a4602b632ebab0a93b07bfb4dfc8f8a33"}, + {file = "zstandard-0.23.0-cp313-cp313-win32.whl", hash = "sha256:a9b07268d0c3ca5c170a385a0ab9fb7fdd9f5fd866be004c4ea39e44edce47dd"}, + {file = "zstandard-0.23.0-cp313-cp313-win_amd64.whl", hash = "sha256:f3513916e8c645d0610815c257cbfd3242adfd5c4cfa78be514e5a3ebb42a41b"}, + {file = "zstandard-0.23.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:2ef3775758346d9ac6214123887d25c7061c92afe1f2b354f9388e9e4d48acfc"}, + {file = 
"zstandard-0.23.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4051e406288b8cdbb993798b9a45c59a4896b6ecee2f875424ec10276a895740"}, + {file = "zstandard-0.23.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e2d1a054f8f0a191004675755448d12be47fa9bebbcffa3cdf01db19f2d30a54"}, + {file = "zstandard-0.23.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f83fa6cae3fff8e98691248c9320356971b59678a17f20656a9e59cd32cee6d8"}, + {file = "zstandard-0.23.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:32ba3b5ccde2d581b1e6aa952c836a6291e8435d788f656fe5976445865ae045"}, + {file = "zstandard-0.23.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2f146f50723defec2975fb7e388ae3a024eb7151542d1599527ec2aa9cacb152"}, + {file = "zstandard-0.23.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1bfe8de1da6d104f15a60d4a8a768288f66aa953bbe00d027398b93fb9680b26"}, + {file = "zstandard-0.23.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:29a2bc7c1b09b0af938b7a8343174b987ae021705acabcbae560166567f5a8db"}, + {file = "zstandard-0.23.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:61f89436cbfede4bc4e91b4397eaa3e2108ebe96d05e93d6ccc95ab5714be512"}, + {file = "zstandard-0.23.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:53ea7cdc96c6eb56e76bb06894bcfb5dfa93b7adcf59d61c6b92674e24e2dd5e"}, + {file = "zstandard-0.23.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:a4ae99c57668ca1e78597d8b06d5af837f377f340f4cce993b551b2d7731778d"}, + {file = "zstandard-0.23.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:379b378ae694ba78cef921581ebd420c938936a153ded602c4fea612b7eaa90d"}, + {file = "zstandard-0.23.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:50a80baba0285386f97ea36239855f6020ce452456605f262b2d33ac35c7770b"}, + {file = "zstandard-0.23.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:61062387ad820c654b6a6b5f0b94484fa19515e0c5116faf29f41a6bc91ded6e"}, + {file = "zstandard-0.23.0-cp38-cp38-win32.whl", hash = "sha256:b8c0bd73aeac689beacd4e7667d48c299f61b959475cdbb91e7d3d88d27c56b9"}, + {file = "zstandard-0.23.0-cp38-cp38-win_amd64.whl", hash = "sha256:a05e6d6218461eb1b4771d973728f0133b2a4613a6779995df557f70794fd60f"}, + {file = "zstandard-0.23.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3aa014d55c3af933c1315eb4bb06dd0459661cc0b15cd61077afa6489bec63bb"}, + {file = "zstandard-0.23.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0a7f0804bb3799414af278e9ad51be25edf67f78f916e08afdb983e74161b916"}, + {file = "zstandard-0.23.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb2b1ecfef1e67897d336de3a0e3f52478182d6a47eda86cbd42504c5cbd009a"}, + {file = "zstandard-0.23.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:837bb6764be6919963ef41235fd56a6486b132ea64afe5fafb4cb279ac44f259"}, + {file = "zstandard-0.23.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1516c8c37d3a053b01c1c15b182f3b5f5eef19ced9b930b684a73bad121addf4"}, + {file = "zstandard-0.23.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:48ef6a43b1846f6025dde6ed9fee0c24e1149c1c25f7fb0a0585572b2f3adc58"}, + {file = "zstandard-0.23.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:11e3bf3c924853a2d5835b24f03eeba7fc9b07d8ca499e247e06ff5676461a15"}, + {file = 
"zstandard-0.23.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:2fb4535137de7e244c230e24f9d1ec194f61721c86ebea04e1581d9d06ea1269"}, + {file = "zstandard-0.23.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8c24f21fa2af4bb9f2c492a86fe0c34e6d2c63812a839590edaf177b7398f700"}, + {file = "zstandard-0.23.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:a8c86881813a78a6f4508ef9daf9d4995b8ac2d147dcb1a450448941398091c9"}, + {file = "zstandard-0.23.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:fe3b385d996ee0822fd46528d9f0443b880d4d05528fd26a9119a54ec3f91c69"}, + {file = "zstandard-0.23.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:82d17e94d735c99621bf8ebf9995f870a6b3e6d14543b99e201ae046dfe7de70"}, + {file = "zstandard-0.23.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:c7c517d74bea1a6afd39aa612fa025e6b8011982a0897768a2f7c8ab4ebb78a2"}, + {file = "zstandard-0.23.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:1fd7e0f1cfb70eb2f95a19b472ee7ad6d9a0a992ec0ae53286870c104ca939e5"}, + {file = "zstandard-0.23.0-cp39-cp39-win32.whl", hash = "sha256:43da0f0092281bf501f9c5f6f3b4c975a8a0ea82de49ba3f7100e64d422a1274"}, + {file = "zstandard-0.23.0-cp39-cp39-win_amd64.whl", hash = "sha256:f8346bfa098532bc1fb6c7ef06783e969d87a99dd1d2a5a18a892c1d7a643c58"}, + {file = "zstandard-0.23.0.tar.gz", hash = "sha256:b2d8c62d08e7255f68f7a740bae85b3c9b8e5466baa9cbf7f57f1cde0ac6bc09"}, ] [package.dependencies] @@ -4458,4 +4510,4 @@ zstandard = ["zstandard"] [metadata] lock-version = "2.0" python-versions = "^3.8" -content-hash = "6e68bbd21368ac70baa311ed9567b5ad971b134207972549b1835718f76402a6" +content-hash = "d232005b02937046823d794fc4b49fc6f96b9435178185c8e1de39c3219e8d68" diff --git a/pyiceberg/catalog/__init__.py b/pyiceberg/catalog/__init__.py index 9a951b5c8e..ae978329a0 100644 --- a/pyiceberg/catalog/__init__.py +++ b/pyiceberg/catalog/__init__.py @@ -49,6 +49,7 @@ from pyiceberg.schema import Schema from pyiceberg.serializers import ToOutputFile from pyiceberg.table import ( + DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE, CommitTableRequest, CommitTableResponse, CreateTableTransaction, @@ -675,8 +676,11 @@ def _convert_schema_if_needed(schema: Union[Schema, "pa.Schema"]) -> Schema: from pyiceberg.io.pyarrow import _ConvertToIcebergWithoutIDs, visit_pyarrow + downcast_ns_timestamp_to_us = Config().get_bool(DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE) or False if isinstance(schema, pa.Schema): - schema: Schema = visit_pyarrow(schema, _ConvertToIcebergWithoutIDs()) # type: ignore + schema: Schema = visit_pyarrow( # type: ignore + schema, _ConvertToIcebergWithoutIDs(downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us) + ) return schema except ModuleNotFoundError: pass diff --git a/pyiceberg/catalog/glue.py b/pyiceberg/catalog/glue.py index 8819c2e266..bc85b977f9 100644 --- a/pyiceberg/catalog/glue.py +++ b/pyiceberg/catalog/glue.py @@ -109,6 +109,10 @@ GLUE_SKIP_ARCHIVE = "glue.skip-archive" GLUE_SKIP_ARCHIVE_DEFAULT = True +# Configure an alternative endpoint of the Glue service for GlueCatalog to access. 
+# This could be used to use GlueCatalog with any glue-compatible metastore service that has a different endpoint +GLUE_CATALOG_ENDPOINT = "glue.endpoint" + ICEBERG_FIELD_ID = "iceberg.field.id" ICEBERG_FIELD_OPTIONAL = "iceberg.field.optional" ICEBERG_FIELD_CURRENT = "iceberg.field.current" @@ -289,7 +293,7 @@ def __init__(self, name: str, **properties: Any): aws_secret_access_key=properties.get("aws_secret_access_key"), aws_session_token=properties.get("aws_session_token"), ) - self.glue: GlueClient = session.client("glue") + self.glue: GlueClient = session.client("glue", endpoint_url=properties.get(GLUE_CATALOG_ENDPOINT)) if glue_catalog_id := properties.get(GLUE_ID): _register_glue_catalog_id_with_glue_client(self.glue, glue_catalog_id) diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index e6490ae156..cd6736fbba 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -31,6 +31,7 @@ import logging import os import re +import uuid from abc import ABC, abstractmethod from concurrent.futures import Future from copy import copy @@ -112,13 +113,14 @@ DataFileContent, FileFormat, ) -from pyiceberg.partitioning import PartitionField, PartitionSpec, partition_record_value +from pyiceberg.partitioning import PartitionField, PartitionFieldValue, PartitionKey, PartitionSpec, partition_record_value from pyiceberg.schema import ( PartnerAccessor, PreOrderSchemaVisitor, Schema, SchemaVisitorPerPrimitiveType, SchemaWithPartnerVisitor, + _check_schema_compatible, pre_order_visit, promote, prune_columns, @@ -126,7 +128,6 @@ visit, visit_with_partner, ) -from pyiceberg.table import PropertyUtil, TableProperties, WriteTask from pyiceberg.table.metadata import TableMetadata from pyiceberg.table.name_mapping import NameMapping from pyiceberg.transforms import TruncateTransform @@ -154,12 +155,14 @@ UUIDType, ) from pyiceberg.utils.concurrent import ExecutorFactory +from pyiceberg.utils.config import Config from pyiceberg.utils.datetime import millis_to_datetime +from pyiceberg.utils.deprecated import deprecated from pyiceberg.utils.singleton import Singleton from pyiceberg.utils.truncate import truncate_upper_bound_binary_string, truncate_upper_bound_text_string if TYPE_CHECKING: - from pyiceberg.table import FileScanTask + from pyiceberg.table import FileScanTask, WriteTask logger = logging.getLogger(__name__) @@ -173,6 +176,7 @@ MAP_KEY_NAME = "key" MAP_VALUE_NAME = "value" DOC = "doc" +UTC_ALIASES = {"UTC", "+00:00", "Etc/UTC", "Z"} T = TypeVar("T") @@ -470,7 +474,9 @@ def __setstate__(self, state: Dict[str, Any]) -> None: def schema_to_pyarrow( - schema: Union[Schema, IcebergType], metadata: Dict[bytes, bytes] = EMPTY_DICT, include_field_ids: bool = True + schema: Union[Schema, IcebergType], + metadata: Dict[bytes, bytes] = EMPTY_DICT, + include_field_ids: bool = True, ) -> pa.schema: return visit(schema, _ConvertToArrowSchema(metadata, include_field_ids)) @@ -663,12 +669,14 @@ def _combine_positional_deletes(positional_deletes: List[pa.ChunkedArray], start return np.subtract(np.setdiff1d(np.arange(start_index, end_index), all_chunks, assume_unique=False), start_index) -def pyarrow_to_schema(schema: pa.Schema, name_mapping: Optional[NameMapping] = None) -> Schema: +def pyarrow_to_schema( + schema: pa.Schema, name_mapping: Optional[NameMapping] = None, downcast_ns_timestamp_to_us: bool = False +) -> Schema: has_ids = visit_pyarrow(schema, _HasIds()) if has_ids: - visitor = _ConvertToIceberg() + visitor = 
_ConvertToIceberg(downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us) elif name_mapping is not None: - visitor = _ConvertToIceberg(name_mapping=name_mapping) + visitor = _ConvertToIceberg(name_mapping=name_mapping, downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us) else: raise ValueError( "Parquet file does not have field-ids and the Iceberg table does not have 'schema.name-mapping.default' defined" @@ -676,8 +684,8 @@ def pyarrow_to_schema(schema: pa.Schema, name_mapping: Optional[NameMapping] = N return visit_pyarrow(schema, visitor) -def _pyarrow_to_schema_without_ids(schema: pa.Schema) -> Schema: - return visit_pyarrow(schema, _ConvertToIcebergWithoutIDs()) +def _pyarrow_to_schema_without_ids(schema: pa.Schema, downcast_ns_timestamp_to_us: bool = False) -> Schema: + return visit_pyarrow(schema, _ConvertToIcebergWithoutIDs(downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us)) def _pyarrow_schema_ensure_large_types(schema: pa.Schema) -> pa.Schema: @@ -849,9 +857,10 @@ class _ConvertToIceberg(PyArrowSchemaVisitor[Union[IcebergType, Schema]]): _field_names: List[str] _name_mapping: Optional[NameMapping] - def __init__(self, name_mapping: Optional[NameMapping] = None) -> None: + def __init__(self, name_mapping: Optional[NameMapping] = None, downcast_ns_timestamp_to_us: bool = False) -> None: self._field_names = [] self._name_mapping = name_mapping + self._downcast_ns_timestamp_to_us = downcast_ns_timestamp_to_us def _field_id(self, field: pa.Field) -> int: if self._name_mapping: @@ -918,11 +927,24 @@ def primitive(self, primitive: pa.DataType) -> PrimitiveType: return TimeType() elif pa.types.is_timestamp(primitive): primitive = cast(pa.TimestampType, primitive) - if primitive.unit == "us": - if primitive.tz == "UTC" or primitive.tz == "+00:00": - return TimestamptzType() - elif primitive.tz is None: - return TimestampType() + if primitive.unit in ("s", "ms", "us"): + # Supported types, will be upcast automatically to 'us' + pass + elif primitive.unit == "ns": + if self._downcast_ns_timestamp_to_us: + logger.warning("Iceberg does not yet support 'ns' timestamp precision. Downcasting to 'us'.") + else: + raise TypeError( + "Iceberg does not yet support 'ns' timestamp precision. Use 'downcast-ns-timestamp-to-us-on-write' configuration property to automatically downcast 'ns' to 'us' on write." + ) + else: + raise TypeError(f"Unsupported precision for timestamp type: {primitive.unit}") + + if primitive.tz in UTC_ALIASES: + return TimestamptzType() + elif primitive.tz is None: + return TimestampType() + elif pa.types.is_binary(primitive) or pa.types.is_large_binary(primitive): return BinaryType() elif pa.types.is_fixed_size_binary(primitive): @@ -1010,8 +1032,11 @@ def _task_to_record_batches( with fs.open_input_file(path) as fin: fragment = arrow_format.make_fragment(fin) physical_schema = fragment.physical_schema - file_schema = pyarrow_to_schema(physical_schema, name_mapping) - + # In V1 and V2 table formats, we only support Timestamp 'us' in Iceberg Schema + # Hence it is reasonable to always cast 'ns' timestamp to 'us' on read. + # When V3 support is introduced, we will update `downcast_ns_timestamp_to_us` flag based on + # the table format version. 
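As a rough sketch of the conversion behavior this flag enables (the schema and field name below are illustrative, not taken from the change): an Arrow 'ns' timestamp is accepted and mapped to Iceberg's microsecond-precision types instead of raising.

    import pyarrow as pa
    from pyiceberg.io.pyarrow import _ConvertToIcebergWithoutIDs, visit_pyarrow

    arrow_schema = pa.schema([pa.field("event_ts", pa.timestamp("ns", tz="Etc/UTC"))])
    # With the flag set, 'ns' is downcast (with a warning) and 'Etc/UTC' is treated as UTC,
    # so the field becomes an Iceberg timestamptz; without the flag a TypeError is raised.
    iceberg_schema = visit_pyarrow(arrow_schema, _ConvertToIcebergWithoutIDs(downcast_ns_timestamp_to_us=True))
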
+ file_schema = pyarrow_to_schema(physical_schema, name_mapping, downcast_ns_timestamp_to_us=True) pyarrow_filter = None if bound_row_filter is not AlwaysTrue(): translated_row_filter = translate_column_names(bound_row_filter, file_schema, case_sensitive=case_sensitive) @@ -1025,8 +1050,10 @@ def _task_to_record_batches( fragment_scanner = ds.Scanner.from_fragment( fragment=fragment, - # We always use large types in memory as it uses larger offsets - # That can chunk more row values into the buffers + # With PyArrow 16.0.0 there is an issue with casting record-batches: + # https://github.com/apache/arrow/issues/41884 + # https://github.com/apache/arrow/issues/43183 + # Would be good to remove this later on schema=_pyarrow_schema_ensure_large_types(physical_schema), # This will push down the query to Arrow. # But in case there are positional deletes, we have to apply them first @@ -1049,7 +1076,7 @@ def _task_to_record_batches( arrow_table = pa.Table.from_batches([batch]) arrow_table = arrow_table.filter(pyarrow_filter) batch = arrow_table.to_batches()[0] - yield to_requested_schema(projected_schema, file_project_schema, batch) + yield _to_requested_schema(projected_schema, file_project_schema, batch, downcast_ns_timestamp_to_us=True) current_index += len(batch) @@ -1062,11 +1089,17 @@ def _task_to_table( positional_deletes: Optional[List[ChunkedArray]], case_sensitive: bool, name_mapping: Optional[NameMapping] = None, -) -> pa.Table: - batches = _task_to_record_batches( - fs, task, bound_row_filter, projected_schema, projected_field_ids, positional_deletes, case_sensitive, name_mapping +) -> Optional[pa.Table]: + batches = list( + _task_to_record_batches( + fs, task, bound_row_filter, projected_schema, projected_field_ids, positional_deletes, case_sensitive, name_mapping + ) ) - return pa.Table.from_batches(batches, schema=schema_to_pyarrow(projected_schema, include_field_ids=False)) + + if len(batches) > 0: + return pa.Table.from_batches(batches) + else: + return None def _read_all_delete_files(fs: FileSystem, tasks: Iterable[FileScanTask]) -> Dict[str, List[ChunkedArray]]: @@ -1170,7 +1203,7 @@ def project_table( if len(tables) < 1: return pa.Table.from_batches([], schema=schema_to_pyarrow(projected_schema, include_field_ids=False)) - result = pa.concat_tables(tables) + result = pa.concat_tables(tables, promote_options="permissive") if limit is not None: return result.slice(0, limit) @@ -1248,8 +1281,13 @@ def project_batches( total_row_count += len(batch) -def to_requested_schema(requested_schema: Schema, file_schema: Schema, batch: pa.RecordBatch) -> pa.RecordBatch: - struct_array = visit_with_partner(requested_schema, batch, ArrowProjectionVisitor(file_schema), ArrowAccessor(file_schema)) +@deprecated( + deprecated_in="0.7.0", + removed_in="0.8.0", + help_message="The public API for 'to_requested_schema' is deprecated and is replaced by '_to_requested_schema'", +) +def to_requested_schema(requested_schema: Schema, file_schema: Schema, table: pa.Table) -> pa.Table: + struct_array = visit_with_partner(requested_schema, table, ArrowProjectionVisitor(file_schema), ArrowAccessor(file_schema)) arrays = [] fields = [] @@ -1257,33 +1295,84 @@ def to_requested_schema(requested_schema: Schema, file_schema: Schema, batch: pa array = struct_array.field(pos) arrays.append(array) fields.append(pa.field(field.name, array.type, field.optional)) - return pa.RecordBatch.from_arrays(arrays, schema=pa.schema(fields)) + return pa.Table.from_arrays(arrays, schema=pa.schema(fields)) + + +def 
_to_requested_schema( + requested_schema: Schema, + file_schema: Schema, + batch: pa.RecordBatch, + downcast_ns_timestamp_to_us: bool = False, + include_field_ids: bool = False, +) -> pa.RecordBatch: + # We could re-use some of these visitors + struct_array = visit_with_partner( + requested_schema, + batch, + ArrowProjectionVisitor(file_schema, downcast_ns_timestamp_to_us, include_field_ids), + ArrowAccessor(file_schema), + ) + return pa.RecordBatch.from_struct_array(struct_array) class ArrowProjectionVisitor(SchemaWithPartnerVisitor[pa.Array, Optional[pa.Array]]): - file_schema: Schema + _file_schema: Schema + _include_field_ids: bool + _downcast_ns_timestamp_to_us: bool - def __init__(self, file_schema: Schema): - self.file_schema = file_schema + def __init__(self, file_schema: Schema, downcast_ns_timestamp_to_us: bool = False, include_field_ids: bool = False) -> None: + self._file_schema = file_schema + self._include_field_ids = include_field_ids + self._downcast_ns_timestamp_to_us = downcast_ns_timestamp_to_us def _cast_if_needed(self, field: NestedField, values: pa.Array) -> pa.Array: - file_field = self.file_schema.find_field(field.field_id) + file_field = self._file_schema.find_field(field.field_id) + if field.field_type.is_primitive: if field.field_type != file_field.field_type: - return values.cast(schema_to_pyarrow(promote(file_field.field_type, field.field_type), include_field_ids=False)) - elif (target_type := schema_to_pyarrow(field.field_type, include_field_ids=False)) != values.type: - # if file_field and field_type (e.g. String) are the same - # but the pyarrow type of the array is different from the expected type - # (e.g. string vs larger_string), we want to cast the array to the larger type - return values.cast(target_type) + return values.cast( + schema_to_pyarrow(promote(file_field.field_type, field.field_type), include_field_ids=self._include_field_ids) + ) + elif (target_type := schema_to_pyarrow(field.field_type, include_field_ids=self._include_field_ids)) != values.type: + if field.field_type == TimestampType(): + # Downcasting of nanoseconds to microseconds + if ( + pa.types.is_timestamp(target_type) + and not target_type.tz + and pa.types.is_timestamp(values.type) + and not values.type.tz + ): + if target_type.unit == "us" and values.type.unit == "ns" and self._downcast_ns_timestamp_to_us: + return values.cast(target_type, safe=False) + elif target_type.unit == "us" and values.type.unit in {"s", "ms"}: + return values.cast(target_type) + raise ValueError(f"Unsupported schema projection from {values.type} to {target_type}") + elif field.field_type == TimestamptzType(): + if ( + pa.types.is_timestamp(target_type) + and target_type.tz == "UTC" + and pa.types.is_timestamp(values.type) + and values.type.tz in UTC_ALIASES + ): + if target_type.unit == "us" and values.type.unit == "ns" and self._downcast_ns_timestamp_to_us: + return values.cast(target_type, safe=False) + elif target_type.unit == "us" and values.type.unit in {"s", "ms", "us"}: + return values.cast(target_type) + raise ValueError(f"Unsupported schema projection from {values.type} to {target_type}") return values def _construct_field(self, field: NestedField, arrow_type: pa.DataType) -> pa.Field: + metadata = {} + if field.doc: + metadata[PYARROW_FIELD_DOC_KEY] = field.doc + if self._include_field_ids: + metadata[PYARROW_PARQUET_FIELD_ID_KEY] = str(field.field_id) + return pa.field( name=field.name, type=arrow_type, nullable=field.optional, - metadata={DOC: field.doc} if field.doc is not None else None, + 
metadata=metadata, ) def schema(self, schema: Schema, schema_partner: Optional[pa.Array], struct_result: Optional[pa.Array]) -> Optional[pa.Array]: @@ -1319,7 +1408,7 @@ def list(self, list_type: ListType, list_array: Optional[pa.Array], value_array: # This can be removed once this has been fixed: # https://github.com/apache/arrow/issues/38809 list_array = pa.LargeListArray.from_arrays(list_array.offsets, value_array) - + value_array = self._cast_if_needed(list_type.element_field, value_array) arrow_field = pa.large_list(self._construct_field(list_type.element_field, value_array.type)) return list_array.cast(arrow_field) else: @@ -1329,6 +1418,8 @@ def map( self, map_type: MapType, map_array: Optional[pa.Array], key_result: Optional[pa.Array], value_result: Optional[pa.Array] ) -> Optional[pa.Array]: if isinstance(map_array, pa.MapArray) and key_result is not None and value_result is not None: + key_result = self._cast_if_needed(map_type.key_field, key_result) + value_result = self._cast_if_needed(map_type.value_field, value_result) arrow_field = pa.map_( self._construct_field(map_type.key_field, key_result.type), self._construct_field(map_type.value_field, value_result.type), @@ -1364,6 +1455,8 @@ def field_partner(self, partner_struct: Optional[pa.Array], field_id: int, _: st if isinstance(partner_struct, pa.StructArray): return partner_struct.field(name) + elif isinstance(partner_struct, pa.Table): + return partner_struct.column(name).combine_chunks() elif isinstance(partner_struct, pa.RecordBatch): return partner_struct.column(name) else: @@ -1459,9 +1552,16 @@ def __init__(self, iceberg_type: PrimitiveType, physical_type_string: str, trunc expected_physical_type = _primitive_to_physical(iceberg_type) if expected_physical_type != physical_type_string: - raise ValueError( - f"Unexpected physical type {physical_type_string} for {iceberg_type}, expected {expected_physical_type}" - ) + # Allow promotable physical types + # INT32 -> INT64 and FLOAT -> DOUBLE are safe type casts + if (physical_type_string == "INT32" and expected_physical_type == "INT64") or ( + physical_type_string == "FLOAT" and expected_physical_type == "DOUBLE" + ): + pass + else: + raise ValueError( + f"Unexpected physical type {physical_type_string} for {iceberg_type}, expected {expected_physical_type}" + ) self.primitive_type = iceberg_type @@ -1563,6 +1663,8 @@ class PyArrowStatisticsCollector(PreOrderSchemaVisitor[List[StatisticsCollector] _default_mode: str def __init__(self, schema: Schema, properties: Dict[str, str]): + from pyiceberg.table import TableProperties + self._schema = schema self._properties = properties self._default_mode = self._properties.get( @@ -1598,6 +1700,8 @@ def map( return k + v def primitive(self, primitive: PrimitiveType) -> List[StatisticsCollector]: + from pyiceberg.table import TableProperties + column_name = self._schema.find_column_name(self._field_id) if column_name is None: return [] @@ -1802,16 +1906,6 @@ def data_file_statistics_from_parquet_metadata( set the mode for column metrics collection parquet_column_mapping (Dict[str, int]): The mapping of the parquet file name to the field ID """ - if parquet_metadata.num_columns != len(stats_columns): - raise ValueError( - f"Number of columns in statistics configuration ({len(stats_columns)}) is different from the number of columns in pyarrow table ({parquet_metadata.num_columns})" - ) - - if parquet_metadata.num_columns != len(parquet_column_mapping): - raise ValueError( - f"Number of columns in column mapping 
({len(parquet_column_mapping)}) is different from the number of columns in pyarrow table ({parquet_metadata.num_columns})" - ) - column_sizes: Dict[int, int] = {} value_counts: Dict[int, int] = {} split_offsets: List[int] = [] @@ -1821,6 +1915,7 @@ def data_file_statistics_from_parquet_metadata( col_aggs = {} + invalidate_col: Set[int] = set() for r in range(parquet_metadata.num_row_groups): # References: # https://github.com/apache/iceberg/blob/fc381a81a1fdb8f51a0637ca27cd30673bd7aad3/parquet/src/main/java/org/apache/iceberg/parquet/ParquetUtil.java#L232 @@ -1836,8 +1931,6 @@ def data_file_statistics_from_parquet_metadata( else: split_offsets.append(data_offset) - invalidate_col: Set[int] = set() - for pos in range(parquet_metadata.num_columns): column = row_group.column(pos) field_id = parquet_column_mapping[column.path_in_schema] @@ -1895,6 +1988,8 @@ def data_file_statistics_from_parquet_metadata( def write_file(io: FileIO, table_metadata: TableMetadata, tasks: Iterator[WriteTask]) -> Iterator[DataFile]: + from pyiceberg.table import DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE, PropertyUtil, TableProperties + parquet_writer_kwargs = _get_parquet_writer_kwargs(table_metadata.properties) row_group_size = PropertyUtil.property_as_int( properties=table_metadata.properties, @@ -1903,8 +1998,7 @@ def write_file(io: FileIO, table_metadata: TableMetadata, tasks: Iterator[WriteT ) def write_parquet(task: WriteTask) -> DataFile: - table_schema = task.schema - + table_schema = table_metadata.schema() # if schema needs to be transformed, use the transformed schema and adjust the arrow table accordingly # otherwise use the original schema if (sanitized_schema := sanitize_column_names(table_schema)) != table_schema: @@ -1912,15 +2006,22 @@ def write_parquet(task: WriteTask) -> DataFile: else: file_schema = table_schema + downcast_ns_timestamp_to_us = Config().get_bool(DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE) or False batches = [ - to_requested_schema(requested_schema=file_schema, file_schema=table_schema, batch=batch) + _to_requested_schema( + requested_schema=file_schema, + file_schema=task.schema, + batch=batch, + downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us, + include_field_ids=True, + ) for batch in task.record_batches ] arrow_table = pa.Table.from_batches(batches) file_path = f'{table_metadata.location}/data/{task.generate_data_file_path("parquet")}' fo = io.new_output(file_path) with fo.create(overwrite=True) as fos: - with pq.ParquetWriter(fos, schema=file_schema.as_arrow(), **parquet_writer_kwargs) as writer: + with pq.ParquetWriter(fos, schema=arrow_table.schema, **parquet_writer_kwargs) as writer: writer.write(arrow_table, row_group_size=row_group_size) statistics = data_file_statistics_from_parquet_metadata( parquet_metadata=writer.writer.metadata, @@ -1968,6 +2069,32 @@ def bin_pack_arrow_table(tbl: pa.Table, target_file_size: int) -> Iterator[List[ return bin_packed_record_batches +def _check_pyarrow_schema_compatible( + requested_schema: Schema, provided_schema: pa.Schema, downcast_ns_timestamp_to_us: bool = False +) -> None: + """ + Check if the `requested_schema` is compatible with `provided_schema`. + + Two schemas are considered compatible when they are equal in terms of the Iceberg Schema type. + + Raises: + ValueError: If the schemas are not compatible. 
+ """ + name_mapping = requested_schema.name_mapping + try: + provided_schema = pyarrow_to_schema( + provided_schema, name_mapping=name_mapping, downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us + ) + except ValueError as e: + provided_schema = _pyarrow_to_schema_without_ids(provided_schema, downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us) + additional_names = set(provided_schema._name_to_id.keys()) - set(requested_schema._name_to_id.keys()) + raise ValueError( + f"PyArrow table contains more columns: {', '.join(sorted(additional_names))}. Update the schema first (hint, use union_by_name)." + ) from e + + _check_schema_compatible(requested_schema, provided_schema) + + def parquet_files_to_data_files(io: FileIO, table_metadata: TableMetadata, file_paths: Iterator[str]) -> Iterator[DataFile]: for file_path in file_paths: input_file = io.new_input(file_path) @@ -1979,6 +2106,8 @@ def parquet_files_to_data_files(io: FileIO, table_metadata: TableMetadata, file_ f"Cannot add file {file_path} because it has field IDs. `add_files` only supports addition of files without field_ids" ) schema = table_metadata.schema() + _check_pyarrow_schema_compatible(schema, parquet_metadata.schema.to_arrow_schema()) + statistics = data_file_statistics_from_parquet_metadata( parquet_metadata=parquet_metadata, stats_columns=compute_statistics_plan(schema, table_metadata.properties), @@ -2005,6 +2134,8 @@ def parquet_files_to_data_files(io: FileIO, table_metadata: TableMetadata, file_ def _get_parquet_writer_kwargs(table_properties: Properties) -> Dict[str, Any]: + from pyiceberg.table import PropertyUtil, TableProperties + for key_pattern in [ TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES, TableProperties.PARQUET_PAGE_ROW_LIMIT, @@ -2042,3 +2173,153 @@ def _get_parquet_writer_kwargs(table_properties: Properties) -> Dict[str, Any]: default=TableProperties.PARQUET_PAGE_ROW_LIMIT_DEFAULT, ), } + + +def _dataframe_to_data_files( + table_metadata: TableMetadata, + df: pa.Table, + io: FileIO, + write_uuid: Optional[uuid.UUID] = None, + counter: Optional[itertools.count[int]] = None, +) -> Iterable[DataFile]: + """Convert a PyArrow table into a DataFile. + + Returns: + An iterable that supplies datafiles that represent the table. + """ + from pyiceberg.table import DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE, PropertyUtil, TableProperties, WriteTask + + counter = counter or itertools.count(0) + write_uuid = write_uuid or uuid.uuid4() + target_file_size: int = PropertyUtil.property_as_int( # type: ignore # The property is set with non-None value. 
+ properties=table_metadata.properties, + property_name=TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, + default=TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT, + ) + name_mapping = table_metadata.schema().name_mapping + downcast_ns_timestamp_to_us = Config().get_bool(DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE) or False + task_schema = pyarrow_to_schema(df.schema, name_mapping=name_mapping, downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us) + + if table_metadata.spec().is_unpartitioned(): + yield from write_file( + io=io, + table_metadata=table_metadata, + tasks=iter([ + WriteTask(write_uuid=write_uuid, task_id=next(counter), record_batches=batches, schema=task_schema) + for batches in bin_pack_arrow_table(df, target_file_size) + ]), + ) + else: + partitions = _determine_partitions(spec=table_metadata.spec(), schema=table_metadata.schema(), arrow_table=df) + yield from write_file( + io=io, + table_metadata=table_metadata, + tasks=iter([ + WriteTask( + write_uuid=write_uuid, + task_id=next(counter), + record_batches=batches, + partition_key=partition.partition_key, + schema=task_schema, + ) + for partition in partitions + for batches in bin_pack_arrow_table(partition.arrow_table_partition, target_file_size) + ]), + ) + + +@dataclass(frozen=True) +class _TablePartition: + partition_key: PartitionKey + arrow_table_partition: pa.Table + + +def _get_table_partitions( + arrow_table: pa.Table, + partition_spec: PartitionSpec, + schema: Schema, + slice_instructions: list[dict[str, Any]], +) -> list[_TablePartition]: + sorted_slice_instructions = sorted(slice_instructions, key=lambda x: x["offset"]) + + partition_fields = partition_spec.fields + + offsets = [inst["offset"] for inst in sorted_slice_instructions] + projected_and_filtered = { + partition_field.source_id: arrow_table[schema.find_field(name_or_id=partition_field.source_id).name] + .take(offsets) + .to_pylist() + for partition_field in partition_fields + } + + table_partitions = [] + for idx, inst in enumerate(sorted_slice_instructions): + partition_slice = arrow_table.slice(**inst) + fieldvalues = [ + PartitionFieldValue(partition_field, projected_and_filtered[partition_field.source_id][idx]) + for partition_field in partition_fields + ] + partition_key = PartitionKey(raw_partition_field_values=fieldvalues, partition_spec=partition_spec, schema=schema) + table_partitions.append(_TablePartition(partition_key=partition_key, arrow_table_partition=partition_slice)) + return table_partitions + + +def _determine_partitions(spec: PartitionSpec, schema: Schema, arrow_table: pa.Table) -> List[_TablePartition]: + """Based on the iceberg table partition spec, slice the arrow table into partitions with their keys. + + Example: + Input: + An arrow table with partition key of ['n_legs', 'year'] and with data of + {'year': [2020, 2022, 2022, 2021, 2022, 2022, 2022, 2019, 2021], + 'n_legs': [2, 2, 2, 4, 4, 4, 4, 5, 100], + 'animal': ["Flamingo", "Parrot", "Parrot", "Dog", "Horse", "Horse", "Horse","Brittle stars", "Centipede"]}. + The algorithm: + Firstly we group the rows into partitions by sorting with sort order [('n_legs', 'descending'), ('year', 'descending')] + and null_placement of "at_end". + This gives the same table as raw input. + Then we sort_indices using reverse order of [('n_legs', 'descending'), ('year', 'descending')] + and null_placement : "at_start". 
+ This gives: + [8, 7, 4, 5, 6, 3, 1, 2, 0] + Based on this we get partition groups of indices: + [{'offset': 8, 'length': 1}, {'offset': 7, 'length': 1}, {'offset': 4, 'length': 3}, {'offset': 3, 'length': 1}, {'offset': 1, 'length': 2}, {'offset': 0, 'length': 1}] + We then retrieve the partition keys by offsets. + And slice the arrow table by offsets and lengths of each partition. + """ + partition_columns: List[Tuple[PartitionField, NestedField]] = [ + (partition_field, schema.find_field(partition_field.source_id)) for partition_field in spec.fields + ] + partition_values_table = pa.table({ + str(partition.field_id): partition.transform.pyarrow_transform(field.field_type)(arrow_table[field.name]) + for partition, field in partition_columns + }) + + # Sort by partitions + sort_indices = pa.compute.sort_indices( + partition_values_table, + sort_keys=[(col, "ascending") for col in partition_values_table.column_names], + null_placement="at_end", + ).to_pylist() + arrow_table = arrow_table.take(sort_indices) + + # Get slice_instructions to group by partitions + partition_values_table = partition_values_table.take(sort_indices) + reversed_indices = pa.compute.sort_indices( + partition_values_table, + sort_keys=[(col, "descending") for col in partition_values_table.column_names], + null_placement="at_start", + ).to_pylist() + slice_instructions: List[Dict[str, Any]] = [] + last = len(reversed_indices) + reversed_indices_size = len(reversed_indices) + ptr = 0 + while ptr < reversed_indices_size: + group_size = last - reversed_indices[ptr] + offset = reversed_indices[ptr] + slice_instructions.append({"offset": offset, "length": group_size}) + last = reversed_indices[ptr] + ptr = ptr + group_size + + table_partitions: List[_TablePartition] = _get_table_partitions(arrow_table, spec, schema, slice_instructions) + + return table_partitions diff --git a/pyiceberg/manifest.py b/pyiceberg/manifest.py index 50231762ef..385313aa5c 100644 --- a/pyiceberg/manifest.py +++ b/pyiceberg/manifest.py @@ -341,7 +341,7 @@ class DataFile(Record): split_offsets: Optional[List[int]] equality_ids: Optional[List[int]] sort_order_id: Optional[int] - spec_id: Optional[int] + spec_id: int def __setattr__(self, name: str, value: Any) -> None: """Assign a key/value to a DataFile.""" @@ -377,7 +377,7 @@ def __eq__(self, other: Any) -> bool: 2: Schema( NestedField(0, "status", IntegerType(), required=True), NestedField(1, "snapshot_id", LongType(), required=False), - NestedField(3, "data_sequence_number", LongType(), required=False), + NestedField(3, "sequence_number", LongType(), required=False), NestedField(4, "file_sequence_number", LongType(), required=False), NestedField(2, "data_file", DATA_FILE_TYPE[2], required=True), ), @@ -394,16 +394,54 @@ def manifest_entry_schema_with_data_file(format_version: TableVersion, data_file class ManifestEntry(Record): - __slots__ = ("status", "snapshot_id", "data_sequence_number", "file_sequence_number", "data_file") + __slots__ = ("status", "snapshot_id", "sequence_number", "file_sequence_number", "data_file") status: ManifestEntryStatus snapshot_id: Optional[int] - data_sequence_number: Optional[int] + sequence_number: Optional[int] file_sequence_number: Optional[int] data_file: DataFile def __init__(self, *data: Any, **named_data: Any) -> None: super().__init__(*data, **{"struct": MANIFEST_ENTRY_SCHEMAS_STRUCT[DEFAULT_READ_VERSION], **named_data}) + def _wrap( + self, + new_status: ManifestEntryStatus, + new_snapshot_id: Optional[int], + new_sequence_number: Optional[int], + 
new_file_sequence_number: Optional[int], + new_file: DataFile, + ) -> ManifestEntry: + self.status = new_status + self.snapshot_id = new_snapshot_id + self.sequence_number = new_sequence_number + self.file_sequence_number = new_file_sequence_number + self.data_file = new_file + return self + + def _wrap_append( + self, new_snapshot_id: Optional[int], new_sequence_number: Optional[int], new_file: DataFile + ) -> ManifestEntry: + return self._wrap(ManifestEntryStatus.ADDED, new_snapshot_id, new_sequence_number, None, new_file) + + def _wrap_delete( + self, + new_snapshot_id: Optional[int], + new_sequence_number: Optional[int], + new_file_sequence_number: Optional[int], + new_file: DataFile, + ) -> ManifestEntry: + return self._wrap(ManifestEntryStatus.DELETED, new_snapshot_id, new_sequence_number, new_file_sequence_number, new_file) + + def _wrap_existing( + self, + new_snapshot_id: Optional[int], + new_sequence_number: Optional[int], + new_file_sequence_number: Optional[int], + new_file: DataFile, + ) -> ManifestEntry: + return self._wrap(ManifestEntryStatus.EXISTING, new_snapshot_id, new_sequence_number, new_file_sequence_number, new_file) + PARTITION_FIELD_SUMMARY_TYPE = StructType( NestedField(509, "contains_null", BooleanType(), required=True), @@ -623,10 +661,10 @@ def _inherit_from_manifest(entry: ManifestEntry, manifest: ManifestFile) -> Mani if entry.snapshot_id is None: entry.snapshot_id = manifest.added_snapshot_id - # in v1 tables, the data sequence number is not persisted and can be safely defaulted to 0 - # in v2 tables, the data sequence number should be inherited iff the entry status is ADDED - if entry.data_sequence_number is None and (manifest.sequence_number == 0 or entry.status == ManifestEntryStatus.ADDED): - entry.data_sequence_number = manifest.sequence_number + # in v1 tables, the sequence number is not persisted and can be safely defaulted to 0 + # in v2 tables, the sequence number should be inherited iff the entry status is ADDED + if entry.sequence_number is None and (manifest.sequence_number == 0 or entry.status == ManifestEntryStatus.ADDED): + entry.sequence_number = manifest.sequence_number # in v1 tables, the file sequence number is not persisted and can be safely defaulted to 0 # in v2 tables, the file sequence number should be inherited iff the entry status is ADDED @@ -653,8 +691,9 @@ class ManifestWriter(ABC): _existing_rows: int _deleted_files: int _deleted_rows: int - _min_data_sequence_number: Optional[int] + _min_sequence_number: Optional[int] _partitions: List[Record] + _reused_entry_wrapper: ManifestEntry def __init__(self, spec: PartitionSpec, schema: Schema, output_file: OutputFile, snapshot_id: int) -> None: self.closed = False @@ -669,8 +708,9 @@ def __init__(self, spec: PartitionSpec, schema: Schema, output_file: OutputFile, self._existing_rows = 0 self._deleted_files = 0 self._deleted_rows = 0 - self._min_data_sequence_number = None + self._min_sequence_number = None self._partitions = [] + self._reused_entry_wrapper = ManifestEntry() def __enter__(self) -> ManifestWriter: """Open the writer.""" @@ -685,6 +725,10 @@ def __exit__( traceback: Optional[TracebackType], ) -> None: """Close the writer.""" + if (self._added_files + self._existing_files + self._deleted_files) == 0: + # This is just a guard to ensure that we don't write empty manifest files + raise ValueError("An empty manifest file has been written") + self.closed = True self._writer.__exit__(exc_type, exc_value, traceback) @@ -726,7 +770,7 @@ def to_manifest_file(self) -> ManifestFile: 
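The renamed `sequence_number` keeps the inheritance rule spelled out in the comments above; a small self-contained sketch of that rule, with plain strings standing in for `ManifestEntryStatus`:

```python
from typing import Optional


def inherit_sequence_number(entry_seq: Optional[int], status: str, manifest_seq: int) -> Optional[int]:
    # v1 manifests (sequence number 0) always inherit; v2 only inherits for ADDED entries.
    if entry_seq is None and (manifest_seq == 0 or status == "ADDED"):
        return manifest_seq
    return entry_seq


assert inherit_sequence_number(None, "ADDED", 7) == 7         # inherited on append
assert inherit_sequence_number(None, "EXISTING", 7) is None   # must already be assigned in v2
assert inherit_sequence_number(None, "EXISTING", 0) == 0      # v1 manifests default to 0
assert inherit_sequence_number(3, "ADDED", 7) == 3            # explicit values win
```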
"""Return the manifest file.""" # once the manifest file is generated, no more entries can be added self.closed = True - min_sequence_number = self._min_data_sequence_number or UNASSIGNED_SEQ + min_sequence_number = self._min_sequence_number or UNASSIGNED_SEQ return ManifestFile( manifest_path=self._output_file.location, manifest_length=len(self._writer.output_file), @@ -757,19 +801,46 @@ def add_entry(self, entry: ManifestEntry) -> ManifestWriter: elif entry.status == ManifestEntryStatus.DELETED: self._deleted_files += 1 self._deleted_rows += entry.data_file.record_count + else: + raise ValueError(f"Unknown entry: {entry.status}") self._partitions.append(entry.data_file.partition) if ( (entry.status == ManifestEntryStatus.ADDED or entry.status == ManifestEntryStatus.EXISTING) - and entry.data_sequence_number is not None - and (self._min_data_sequence_number is None or entry.data_sequence_number < self._min_data_sequence_number) + and entry.sequence_number is not None + and (self._min_sequence_number is None or entry.sequence_number < self._min_sequence_number) ): - self._min_data_sequence_number = entry.data_sequence_number + self._min_sequence_number = entry.sequence_number self._writer.write_block([self.prepare_entry(entry)]) return self + + def add(self, entry: ManifestEntry) -> ManifestWriter: + if entry.sequence_number is not None and entry.sequence_number >= 0: + self.add_entry(self._reused_entry_wrapper._wrap_append(self._snapshot_id, entry.sequence_number, entry.data_file)) + else: + self.add_entry(self._reused_entry_wrapper._wrap_append(self._snapshot_id, None, entry.data_file)) + return self + + def delete(self, entry: ManifestEntry) -> ManifestWriter: + self.add_entry( + self._reused_entry_wrapper._wrap_delete( + self._snapshot_id, entry.sequence_number, entry.file_sequence_number, entry.data_file + ) + ) + return self + + def existing(self, entry: ManifestEntry) -> ManifestWriter: + self.add_entry( + self._reused_entry_wrapper._wrap_existing( + entry.snapshot_id, entry.sequence_number, entry.file_sequence_number, entry.data_file + ) + ) + return self + + def __len__(self) -> int: """Return the total number number of bytes written.""" return len(self._writer) @@ -884,7 +955,7 @@ def _meta(self) -> Dict[str, str]: } def prepare_entry(self, entry: ManifestEntry) -> ManifestEntry: - if entry.data_sequence_number is None: + if entry.sequence_number is None: if entry.snapshot_id is not None and entry.snapshot_id != self._snapshot_id: raise ValueError(f"Found unassigned sequence number for an entry from snapshot: {entry.snapshot_id}") if entry.status != ManifestEntryStatus.ADDED: diff --git a/pyiceberg/schema.py b/pyiceberg/schema.py index 77f1addbf5..cfe3fe3a7b 100644 --- a/pyiceberg/schema.py +++ b/pyiceberg/schema.py @@ -1616,3 +1616,103 @@ def _(file_type: FixedType, read_type: IcebergType) -> IcebergType: return read_type else: raise ResolveError(f"Cannot promote {file_type} to {read_type}") + + +def _check_schema_compatible(requested_schema: Schema, provided_schema: Schema) -> None: + """ + Check if the `provided_schema` is compatible with `requested_schema`. + + Both Schemas must have valid IDs and share the same ID for the same field names. + + Two schemas are considered compatible when: + 1. All `required` fields in `requested_schema` are present and are also `required` in the `provided_schema` + 2. Field Types are consistent for fields that are present in both schemas. I.e. 
the field type + in the `provided_schema` can be promoted to the field type of the same field ID in `requested_schema` + + Raises: + ValueError: If the schemas are not compatible. + """ + pre_order_visit(requested_schema, _SchemaCompatibilityVisitor(provided_schema)) + + +class _SchemaCompatibilityVisitor(PreOrderSchemaVisitor[bool]): + provided_schema: Schema + + def __init__(self, provided_schema: Schema): + from rich.console import Console + from rich.table import Table as RichTable + + self.provided_schema = provided_schema + self.rich_table = RichTable(show_header=True, header_style="bold") + self.rich_table.add_column("") + self.rich_table.add_column("Table field") + self.rich_table.add_column("Dataframe field") + self.console = Console(record=True) + + def _is_field_compatible(self, lhs: NestedField) -> bool: + # Validate nullability first. + # An optional field can be missing in the provided schema + # But a required field must exist as a required field + try: + rhs = self.provided_schema.find_field(lhs.field_id) + except ValueError: + if lhs.required: + self.rich_table.add_row("❌", str(lhs), "Missing") + return False + else: + self.rich_table.add_row("✅", str(lhs), "Missing") + return True + + if lhs.required and not rhs.required: + self.rich_table.add_row("❌", str(lhs), str(rhs)) + return False + + # Check type compatibility + if lhs.field_type == rhs.field_type: + self.rich_table.add_row("✅", str(lhs), str(rhs)) + return True + # We only check that the parent node is also of the same type. + # We check the type of the child nodes when we traverse them later. + elif any( + (isinstance(lhs.field_type, container_type) and isinstance(rhs.field_type, container_type)) + for container_type in {StructType, MapType, ListType} + ): + self.rich_table.add_row("✅", str(lhs), str(rhs)) + return True + else: + try: + # If type can be promoted to the requested schema + # it is considered compatible + promote(rhs.field_type, lhs.field_type) + self.rich_table.add_row("✅", str(lhs), str(rhs)) + return True + except ResolveError: + self.rich_table.add_row("❌", str(lhs), str(rhs)) + return False + + def schema(self, schema: Schema, struct_result: Callable[[], bool]) -> bool: + if not (result := struct_result()): + self.console.print(self.rich_table) + raise ValueError(f"Mismatch in fields:\n{self.console.export_text()}") + return result + + def struct(self, struct: StructType, field_results: List[Callable[[], bool]]) -> bool: + results = [result() for result in field_results] + return all(results) + + def field(self, field: NestedField, field_result: Callable[[], bool]) -> bool: + return self._is_field_compatible(field) and field_result() + + def list(self, list_type: ListType, element_result: Callable[[], bool]) -> bool: + return self._is_field_compatible(list_type.element_field) and element_result() + + def map(self, map_type: MapType, key_result: Callable[[], bool], value_result: Callable[[], bool]) -> bool: + return all([ + self._is_field_compatible(map_type.key_field), + self._is_field_compatible(map_type.value_field), + key_result(), + value_result(), + ]) + + def primitive(self, primitive: PrimitiveType) -> bool: + return True diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py index 2eec4d3036..0b211e673d 100644 --- a/pyiceberg/table/__init__.py +++ b/pyiceberg/table/__init__.py @@ -16,10 +16,13 @@ # under the License. 
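A short sketch of the new check in action (it is a private helper, used here only for illustration): a provided `int` field promotes to the table's `long`, while the reverse direction fails and raises with the rendered comparison table.

```python
from pyiceberg.schema import Schema, _check_schema_compatible
from pyiceberg.types import IntegerType, LongType, NestedField, StringType

table_schema = Schema(
    NestedField(1, "id", LongType(), required=True),
    NestedField(2, "name", StringType(), required=False),
)
provided = Schema(NestedField(1, "id", IntegerType(), required=True))

# Passes: int promotes to long, and the missing `name` column is optional.
_check_schema_compatible(table_schema, provided_schema=provided)

try:
    # Fails: long cannot be narrowed back to int.
    _check_schema_compatible(provided, provided_schema=table_schema)
except ValueError as exc:
    print(exc)
```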
from __future__ import annotations +import concurrent import itertools import uuid import warnings from abc import ABC, abstractmethod +from collections import defaultdict +from concurrent.futures import Future from copy import copy from dataclasses import dataclass from datetime import datetime @@ -50,19 +53,26 @@ from pyiceberg.conversions import from_bytes from pyiceberg.exceptions import CommitFailedException, ResolveError, ValidationError from pyiceberg.expressions import ( + AlwaysFalse, AlwaysTrue, And, BooleanExpression, EqualTo, + Not, + Or, Reference, ) from pyiceberg.expressions.visitors import ( + ROWS_CANNOT_MATCH, + ROWS_MUST_MATCH, _InclusiveMetricsEvaluator, + _StrictMetricsEvaluator, + bind, expression_evaluator, inclusive_projection, manifest_evaluator, ) -from pyiceberg.io import FileIO, load_file_io +from pyiceberg.io import FileIO, OutputFile, load_file_io from pyiceberg.manifest import ( POSITIONAL_DELETE_SCHEMA, DataFile, @@ -71,6 +81,7 @@ ManifestEntry, ManifestEntryStatus, ManifestFile, + ManifestWriter, PartitionFieldSummary, write_manifest, write_manifest_list, @@ -80,7 +91,6 @@ PARTITION_FIELD_ID_START, UNPARTITIONED_PARTITION_SPEC, PartitionField, - PartitionFieldValue, PartitionKey, PartitionSpec, _PartitionNameGenerator, @@ -137,7 +147,9 @@ StructType, transform_dict_value_to_str, ) +from pyiceberg.utils.bin_packing import ListPacker from pyiceberg.utils.concurrent import ExecutorFactory +from pyiceberg.utils.config import Config from pyiceberg.utils.datetime import datetime_to_millis from pyiceberg.utils.deprecated import deprecated from pyiceberg.utils.singleton import _convert_to_hashable_type @@ -153,51 +165,8 @@ ALWAYS_TRUE = AlwaysTrue() TABLE_ROOT_ID = -1 - _JAVA_LONG_MAX = 9223372036854775807 - - -def _check_schema_compatible(table_schema: Schema, other_schema: "pa.Schema") -> None: - """ - Check if the `table_schema` is compatible with `other_schema`. - - Two schemas are considered compatible when they are equal in terms of the Iceberg Schema type. - - Raises: - ValueError: If the schemas are not compatible. - """ - from pyiceberg.io.pyarrow import _pyarrow_to_schema_without_ids, pyarrow_to_schema - - name_mapping = table_schema.name_mapping - try: - task_schema = pyarrow_to_schema(other_schema, name_mapping=name_mapping) - except ValueError as e: - other_schema = _pyarrow_to_schema_without_ids(other_schema) - additional_names = set(other_schema.column_names) - set(table_schema.column_names) - raise ValueError( - f"PyArrow table contains more columns: {', '.join(sorted(additional_names))}. Update the schema first (hint, use union_by_name)." 
- ) from e - - if table_schema.as_struct() != task_schema.as_struct(): - from rich.console import Console - from rich.table import Table as RichTable - - console = Console(record=True) - - rich_table = RichTable(show_header=True, header_style="bold") - rich_table.add_column("") - rich_table.add_column("Table field") - rich_table.add_column("Dataframe field") - - for lhs in table_schema.fields: - try: - rhs = task_schema.find_field(lhs.field_id) - rich_table.add_row("✅" if lhs == rhs else "❌", str(lhs), str(rhs)) - except ValueError: - rich_table.add_row("❌", str(lhs), "Missing") - - console.print(rich_table) - raise ValueError(f"Mismatch in fields:\n{console.export_text()}") +DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE = "downcast-ns-timestamp-to-us-on-write" class TableProperties: @@ -238,10 +207,24 @@ class TableProperties: WRITE_PARTITION_SUMMARY_LIMIT = "write.summary.partition-limit" WRITE_PARTITION_SUMMARY_LIMIT_DEFAULT = 0 + DELETE_MODE = "write.delete.mode" + DELETE_MODE_COPY_ON_WRITE = "copy-on-write" + DELETE_MODE_MERGE_ON_READ = "merge-on-read" + DELETE_MODE_DEFAULT = DELETE_MODE_COPY_ON_WRITE + DEFAULT_NAME_MAPPING = "schema.name-mapping.default" FORMAT_VERSION = "format-version" DEFAULT_FORMAT_VERSION = 2 + MANIFEST_TARGET_SIZE_BYTES = "commit.manifest.target-size-bytes" + MANIFEST_TARGET_SIZE_BYTES_DEFAULT = 8 * 1024 * 1024 # 8 MB + + MANIFEST_MIN_MERGE_COUNT = "commit.manifest.min-count-to-merge" + MANIFEST_MIN_MERGE_COUNT_DEFAULT = 100 + + MANIFEST_MERGE_ENABLED = "commit.manifest-merge.enabled" + MANIFEST_MERGE_ENABLED_DEFAULT = False + class PropertyUtil: @staticmethod @@ -305,7 +288,13 @@ def _apply(self, updates: Tuple[TableUpdate, ...], requirements: Tuple[TableRequ requirement.validate(self.table_metadata) self._updates += updates - self._requirements += requirements + + # For the requirements, it does not make sense to add a requirement more than once + # For example, you cannot assert that the current schema has two different IDs + existing_requirements = {type(requirement) for requirement in self._requirements} + for new_requirement in requirements: + if type(new_requirement) not in existing_requirements: + self._requirements = self._requirements + requirements self.table_metadata = update_table_metadata(self.table_metadata, updates) @@ -316,6 +305,14 @@ def _apply(self, updates: Tuple[TableUpdate, ...], requirements: Tuple[TableRequ return self + def _scan(self, row_filter: Union[str, BooleanExpression] = ALWAYS_TRUE) -> DataScan: + """Minimal data scan the table with the current state of the transaction.""" + return DataScan( + table_metadata=self.table_metadata, + io=self._table.io, + row_filter=row_filter, + ) + def upgrade_table_version(self, format_version: TableVersion) -> Transaction: """Set the table to a certain version. @@ -473,6 +470,8 @@ def append(self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT) except ModuleNotFoundError as e: raise ModuleNotFoundError("For writes PyArrow needs to be installed") from e + from pyiceberg.io.pyarrow import _check_pyarrow_schema_compatible, _dataframe_to_data_files + if not isinstance(df, pa.Table): raise ValueError(f"Expected PyArrow table, got: {df}") @@ -482,28 +481,43 @@ def append(self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT) raise ValueError( f"Not all partition types are supported for writes. Following partitions cannot be written using pyarrow: {unsupported_partitions}." 
) + downcast_ns_timestamp_to_us = Config().get_bool(DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE) or False + _check_pyarrow_schema_compatible( + self._table.schema(), provided_schema=df.schema, downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us + ) - _check_schema_compatible(self._table.schema(), other_schema=df.schema) - # cast if the two schemas are compatible but not equal - table_arrow_schema = self._table.schema().as_arrow() - if table_arrow_schema != df.schema: - df = df.cast(table_arrow_schema) + manifest_merge_enabled = PropertyUtil.property_as_bool( + self.table_metadata.properties, + TableProperties.MANIFEST_MERGE_ENABLED, + TableProperties.MANIFEST_MERGE_ENABLED_DEFAULT, + ) + update_snapshot = self.update_snapshot(snapshot_properties=snapshot_properties) + append_method = update_snapshot.merge_append if manifest_merge_enabled else update_snapshot.fast_append - with self.update_snapshot(snapshot_properties=snapshot_properties).fast_append() as update_snapshot: + with append_method() as append_files: # skip writing data files if the dataframe is empty if df.shape[0] > 0: data_files = _dataframe_to_data_files( - table_metadata=self._table.metadata, write_uuid=update_snapshot.commit_uuid, df=df, io=self._table.io + table_metadata=self._table.metadata, write_uuid=append_files.commit_uuid, df=df, io=self._table.io ) for data_file in data_files: - update_snapshot.append_data_file(data_file) + append_files.append_data_file(data_file) def overwrite( - self, df: pa.Table, overwrite_filter: BooleanExpression = ALWAYS_TRUE, snapshot_properties: Dict[str, str] = EMPTY_DICT + self, + df: pa.Table, + overwrite_filter: Union[BooleanExpression, str] = ALWAYS_TRUE, + snapshot_properties: Dict[str, str] = EMPTY_DICT, ) -> None: """ Shorthand for adding a table overwrite with a PyArrow table to the transaction. + An overwrite may produce zero or more snapshots based on the operation: + + - DELETE: In case existing Parquet files can be dropped completely. + - REPLACE: In case existing Parquet files need to be rewritten. + - APPEND: In case new data is being inserted into the table. + Args: df: The Arrow dataframe that will be used to overwrite the table overwrite_filter: ALWAYS_TRUE when you overwrite all the data, @@ -515,22 +529,25 @@ def overwrite( except ModuleNotFoundError as e: raise ModuleNotFoundError("For writes PyArrow needs to be installed") from e + from pyiceberg.io.pyarrow import _check_pyarrow_schema_compatible, _dataframe_to_data_files + if not isinstance(df, pa.Table): raise ValueError(f"Expected PyArrow table, got: {df}") - if overwrite_filter != AlwaysTrue(): - raise NotImplementedError("Cannot overwrite a subset of a table") - - if len(self._table.spec().fields) > 0: - raise ValueError("Cannot write to partitioned tables") + if unsupported_partitions := [ + field for field in self.table_metadata.spec().fields if not field.transform.supports_pyarrow_transform + ]: + raise ValueError( + f"Not all partition types are supported for writes. Following partitions cannot be written using pyarrow: {unsupported_partitions}." 
+ ) + downcast_ns_timestamp_to_us = Config().get_bool(DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE) or False + _check_pyarrow_schema_compatible( + self._table.schema(), provided_schema=df.schema, downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us + ) - _check_schema_compatible(self._table.schema(), other_schema=df.schema) - # cast if the two schemas are compatible but not equal - table_arrow_schema = self._table.schema().as_arrow() - if table_arrow_schema != df.schema: - df = df.cast(table_arrow_schema) + self.delete(delete_filter=overwrite_filter, snapshot_properties=snapshot_properties) - with self.update_snapshot(snapshot_properties=snapshot_properties).overwrite() as update_snapshot: + with self.update_snapshot(snapshot_properties=snapshot_properties).fast_append() as update_snapshot: # skip writing data files if the dataframe is empty if df.shape[0] > 0: data_files = _dataframe_to_data_files( @@ -539,6 +556,88 @@ def overwrite( for data_file in data_files: update_snapshot.append_data_file(data_file) + def delete(self, delete_filter: Union[str, BooleanExpression], snapshot_properties: Dict[str, str] = EMPTY_DICT) -> None: + """ + Shorthand for deleting record from a table. + + An deletee may produce zero or more snapshots based on the operation: + + - DELETE: In case existing Parquet files can be dropped completely. + - REPLACE: In case existing Parquet files need to be rewritten + + Args: + delete_filter: A boolean expression to delete rows from a table + snapshot_properties: Custom properties to be added to the snapshot summary + """ + from pyiceberg.io.pyarrow import _dataframe_to_data_files, expression_to_pyarrow, project_table + + if ( + self.table_metadata.properties.get(TableProperties.DELETE_MODE, TableProperties.DELETE_MODE_DEFAULT) + == TableProperties.DELETE_MODE_MERGE_ON_READ + ): + warnings.warn("Merge on read is not yet supported, falling back to copy-on-write") + + if isinstance(delete_filter, str): + delete_filter = _parse_row_filter(delete_filter) + + with self.update_snapshot(snapshot_properties=snapshot_properties).delete() as delete_snapshot: + delete_snapshot.delete_by_predicate(delete_filter) + + # Check if there are any files that require an actual rewrite of a data file + if delete_snapshot.rewrites_needed is True: + bound_delete_filter = bind(self._table.schema(), delete_filter, case_sensitive=True) + preserve_row_filter = expression_to_pyarrow(Not(bound_delete_filter)) + + files = self._scan(row_filter=delete_filter).plan_files() + + commit_uuid = uuid.uuid4() + counter = itertools.count(0) + + replaced_files: List[Tuple[DataFile, List[DataFile]]] = [] + # This will load the Parquet file into memory, including: + # - Filter out the rows based on the delete filter + # - Projecting it to the current schema + # - Applying the positional deletes if they are there + # When writing + # - Apply the latest partition-spec + # - And sort order when added + for original_file in files: + df = project_table( + tasks=[original_file], + table_metadata=self._table.metadata, + io=self._table.io, + row_filter=AlwaysTrue(), + projected_schema=self.table_metadata.schema(), + ) + filtered_df = df.filter(preserve_row_filter) + + # Only rewrite if there are records being deleted + if len(df) != len(filtered_df): + replaced_files.append(( + original_file.file, + list( + _dataframe_to_data_files( + io=self._table.io, + df=filtered_df, + table_metadata=self._table.metadata, + write_uuid=commit_uuid, + counter=counter, + ) + ), + )) + + if len(replaced_files) > 0: + with 
self.update_snapshot(snapshot_properties=snapshot_properties).overwrite( + commit_uuid=commit_uuid + ) as overwrite_snapshot: + for original_data_file, replaced_data_files in replaced_files: + overwrite_snapshot.delete_data_file(original_data_file) + for replaced_data_file in replaced_data_files: + overwrite_snapshot.append_data_file(replaced_data_file) + + if not delete_snapshot.files_affected and not delete_snapshot.rewrites_needed: + warnings.warn("Delete operation did not match any records") + def add_files(self, file_paths: List[str], snapshot_properties: Dict[str, str] = EMPTY_DICT) -> None: """ Shorthand API for adding files as data files to the table transaction. @@ -1381,6 +1480,9 @@ def current_snapshot(self) -> Optional[Snapshot]: return self.snapshot_by_id(self.metadata.current_snapshot_id) return None + def snapshots(self) -> List[Snapshot]: + return self.metadata.snapshots + def snapshot_by_id(self, snapshot_id: int) -> Optional[Snapshot]: """Get the snapshot of this table with the given id, or None if there is no matching snapshot.""" return self.metadata.snapshot_by_id(snapshot_id) @@ -1455,11 +1557,20 @@ def append(self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT) tx.append(df=df, snapshot_properties=snapshot_properties) def overwrite( - self, df: pa.Table, overwrite_filter: BooleanExpression = ALWAYS_TRUE, snapshot_properties: Dict[str, str] = EMPTY_DICT + self, + df: pa.Table, + overwrite_filter: Union[BooleanExpression, str] = ALWAYS_TRUE, + snapshot_properties: Dict[str, str] = EMPTY_DICT, ) -> None: """ Shorthand for overwriting the table with a PyArrow table. + An overwrite may produce zero or more snapshots based on the operation: + + - DELETE: In case existing Parquet files can be dropped completely. + - REPLACE: In case existing Parquet files need to be rewritten. + - APPEND: In case new data is being inserted into the table. + Args: df: The Arrow dataframe that will be used to overwrite the table overwrite_filter: ALWAYS_TRUE when you overwrite all the data, @@ -1469,6 +1580,19 @@ def overwrite( with self.transaction() as tx: tx.overwrite(df=df, overwrite_filter=overwrite_filter, snapshot_properties=snapshot_properties) + def delete( + self, delete_filter: Union[BooleanExpression, str] = ALWAYS_TRUE, snapshot_properties: Dict[str, str] = EMPTY_DICT + ) -> None: + """ + Shorthand for deleting rows from the table. + + Args: + delete_filter: The predicate that used to remove rows + snapshot_properties: Custom properties to be added to the snapshot summary + """ + with self.transaction() as tx: + tx.delete(delete_filter=delete_filter, snapshot_properties=snapshot_properties) + def add_files(self, file_paths: List[str], snapshot_properties: Dict[str, str] = EMPTY_DICT) -> None: """ Shorthand API for adding files as data files to the table. @@ -1718,7 +1842,7 @@ def _open_manifest( ] -def _min_data_file_sequence_number(manifests: List[ManifestFile]) -> int: +def _min_sequence_number(manifests: List[ManifestFile]) -> int: try: return min( manifest.min_sequence_number or INITIAL_SEQUENCE_NUMBER @@ -1779,11 +1903,11 @@ def _build_partition_evaluator(self, spec_id: int) -> Callable[[DataFile], bool] # shared instance across multiple threads. 
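A hypothetical end-to-end use of the new delete shorthand; the catalog and table names are invented, and the current implementation always falls back to copy-on-write as warned above.

```python
from pyiceberg.catalog import load_catalog
from pyiceberg.expressions import EqualTo

catalog = load_catalog("default")             # assumes a configured catalog
tbl = catalog.load_table("analytics.events")  # hypothetical table

# A string predicate is accepted too, e.g. tbl.delete(delete_filter="event_type = 'debug'")
tbl.delete(delete_filter=EqualTo("event_type", "debug"))
```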
return lambda data_file: expression_evaluator(partition_schema, partition_expr, self.case_sensitive)(data_file.partition) - def _check_sequence_number(self, min_data_sequence_number: int, manifest: ManifestFile) -> bool: + def _check_sequence_number(self, min_sequence_number: int, manifest: ManifestFile) -> bool: """Ensure that no manifests are loaded that contain deletes that are older than the data. Args: - min_data_sequence_number (int): The minimal sequence number. + min_sequence_number (int): The minimal sequence number. manifest (ManifestFile): A ManifestFile that can be either data or deletes. Returns: @@ -1792,7 +1916,7 @@ def _check_sequence_number(self, min_data_sequence_number: int, manifest: Manife return manifest.content == ManifestContent.DATA or ( # Not interested in deletes that are older than the data manifest.content == ManifestContent.DELETES - and (manifest.sequence_number or INITIAL_SEQUENCE_NUMBER) >= min_data_sequence_number + and (manifest.sequence_number or INITIAL_SEQUENCE_NUMBER) >= min_sequence_number ) def plan_files(self) -> Iterable[FileScanTask]: @@ -1824,10 +1948,10 @@ def plan_files(self) -> Iterable[FileScanTask]: self.table_metadata.schema(), self.row_filter, self.case_sensitive, self.options.get("include_empty_files") == "true" ).eval - min_data_sequence_number = _min_data_file_sequence_number(manifests) + min_sequence_number = _min_sequence_number(manifests) data_entries: List[ManifestEntry] = [] - positional_delete_entries = SortedList(key=lambda entry: entry.data_sequence_number or INITIAL_SEQUENCE_NUMBER) + positional_delete_entries = SortedList(key=lambda entry: entry.sequence_number or INITIAL_SEQUENCE_NUMBER) executor = ExecutorFactory.get_or_create() for manifest_entry in chain( @@ -1841,7 +1965,7 @@ def plan_files(self) -> Iterable[FileScanTask]: metrics_evaluator, ) for manifest in manifests - if self._check_sequence_number(min_data_sequence_number, manifest) + if self._check_sequence_number(min_sequence_number, manifest) ], ) ): @@ -1884,8 +2008,9 @@ def to_arrow_batch_reader(self) -> pa.RecordBatchReader: from pyiceberg.io.pyarrow import project_batches, schema_to_pyarrow + target_schema = schema_to_pyarrow(self.projection()) return pa.RecordBatchReader.from_batches( - schema_to_pyarrow(self.projection()), + target_schema, project_batches( self.plan_files(), self.table_metadata, @@ -2904,52 +3029,6 @@ def _generate_manifest_list_path(location: str, snapshot_id: int, attempt: int, return f"{location}/metadata/snap-{snapshot_id}-{attempt}-{commit_uuid}.avro" -def _dataframe_to_data_files( - table_metadata: TableMetadata, df: pa.Table, io: FileIO, write_uuid: Optional[uuid.UUID] = None -) -> Iterable[DataFile]: - """Convert a PyArrow table into a DataFile. - - Returns: - An iterable that supplies datafiles that represent the table. - """ - from pyiceberg.io.pyarrow import bin_pack_arrow_table, write_file - - counter = itertools.count(0) - write_uuid = write_uuid or uuid.uuid4() - target_file_size: int = PropertyUtil.property_as_int( # type: ignore # The property is set with non-None value. 
- properties=table_metadata.properties, - property_name=TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, - default=TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT, - ) - - if len(table_metadata.spec().fields) > 0: - partitions = _determine_partitions(spec=table_metadata.spec(), schema=table_metadata.schema(), arrow_table=df) - yield from write_file( - io=io, - table_metadata=table_metadata, - tasks=iter([ - WriteTask( - write_uuid=write_uuid, - task_id=next(counter), - record_batches=batches, - partition_key=partition.partition_key, - schema=table_metadata.schema(), - ) - for partition in partitions - for batches in bin_pack_arrow_table(partition.arrow_table_partition, target_file_size) - ]), - ) - else: - yield from write_file( - io=io, - table_metadata=table_metadata, - tasks=iter([ - WriteTask(write_uuid=write_uuid, task_id=next(counter), record_batches=batches, schema=table_metadata.schema()) - for batches in bin_pack_arrow_table(df, target_file_size) - ]), - ) - - def _parquet_files_to_data_files(table_metadata: TableMetadata, file_paths: List[str], io: FileIO) -> Iterable[DataFile]: """Convert a list files into DataFiles. @@ -2961,12 +3040,15 @@ def _parquet_files_to_data_files(table_metadata: TableMetadata, file_paths: List yield from parquet_files_to_data_files(io=io, table_metadata=table_metadata, file_paths=iter(file_paths)) -class _MergingSnapshotProducer(UpdateTableMetadata["_MergingSnapshotProducer"]): +class _SnapshotProducer(UpdateTableMetadata[U], Generic[U]): commit_uuid: uuid.UUID + _io: FileIO _operation: Operation _snapshot_id: int _parent_snapshot_id: Optional[int] _added_data_files: List[DataFile] + _manifest_num_counter: itertools.count[int] + _deleted_data_files: Set[DataFile] def __init__( self, @@ -2986,37 +3068,44 @@ def __init__( snapshot.snapshot_id if (snapshot := self._transaction.table_metadata.current_snapshot()) else None ) self._added_data_files = [] + self._deleted_data_files = set() self.snapshot_properties = snapshot_properties + self._manifest_num_counter = itertools.count(0) - def append_data_file(self, data_file: DataFile) -> _MergingSnapshotProducer: + def append_data_file(self, data_file: DataFile) -> _SnapshotProducer[U]: self._added_data_files.append(data_file) return self + def delete_data_file(self, data_file: DataFile) -> _SnapshotProducer[U]: + self._deleted_data_files.add(data_file) + return self + @abstractmethod def _deleted_entries(self) -> List[ManifestEntry]: ... @abstractmethod def _existing_manifests(self) -> List[ManifestFile]: ... 
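A sketch of how a snapshot producer is driven from the outside, mirroring `Transaction.append` above; `tbl` is assumed to be a loaded `Table`, `df` a schema-compatible `pa.Table`, and `_dataframe_to_data_files` is the private helper that now lives in `pyiceberg.io.pyarrow`.

```python
from pyiceberg.io.pyarrow import _dataframe_to_data_files

with tbl.transaction() as tx:
    # fast_append() yields a snapshot producer; exiting the block stages the commit.
    with tx.update_snapshot().fast_append() as append_files:
        for data_file in _dataframe_to_data_files(
            table_metadata=tx.table_metadata,
            write_uuid=append_files.commit_uuid,
            df=df,
            io=tbl.io,
        ):
            append_files.append_data_file(data_file)
```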
+ def _process_manifests(self, manifests: List[ManifestFile]) -> List[ManifestFile]: + """To perform any post-processing on the manifests before writing them to the new snapshot.""" + return manifests + def _manifests(self) -> List[ManifestFile]: def _write_added_manifest() -> List[ManifestFile]: if self._added_data_files: - output_file_location = _new_manifest_path( - location=self._transaction.table_metadata.location, num=0, commit_uuid=self.commit_uuid - ) with write_manifest( format_version=self._transaction.table_metadata.format_version, spec=self._transaction.table_metadata.spec(), schema=self._transaction.table_metadata.schema(), - output_file=self._io.new_output(output_file_location), + output_file=self.new_manifest_output(), snapshot_id=self._snapshot_id, ) as writer: for data_file in self._added_data_files: - writer.add_entry( + writer.add( ManifestEntry( status=ManifestEntryStatus.ADDED, snapshot_id=self._snapshot_id, - data_sequence_number=None, + sequence_number=None, file_sequence_number=None, data_file=data_file, ) @@ -3029,15 +3118,11 @@ def _write_delete_manifest() -> List[ManifestFile]: # Check if we need to mark the files as deleted deleted_entries = self._deleted_entries() if len(deleted_entries) > 0: - output_file_location = _new_manifest_path( - location=self._transaction.table_metadata.location, num=1, commit_uuid=self.commit_uuid - ) - with write_manifest( format_version=self._transaction.table_metadata.format_version, spec=self._transaction.table_metadata.spec(), schema=self._transaction.table_metadata.schema(), - output_file=self._io.new_output(output_file_location), + output_file=self.new_manifest_output(), snapshot_id=self._snapshot_id, ) as writer: for delete_entry in deleted_entries: @@ -3052,7 +3137,7 @@ def _write_delete_manifest() -> List[ManifestFile]: delete_manifests = executor.submit(_write_delete_manifest) existing_manifests = executor.submit(self._existing_manifests) - return added_manifests.result() + delete_manifests.result() + existing_manifests.result() + return self._process_manifests(added_manifests.result() + delete_manifests.result() + existing_manifests.result()) def _summary(self, snapshot_properties: Dict[str, str] = EMPTY_DICT) -> Summary: ssc = SnapshotSummaryCollector() @@ -3070,6 +3155,15 @@ def _summary(self, snapshot_properties: Dict[str, str] = EMPTY_DICT) -> Summary: schema=self._transaction.table_metadata.schema(), ) + if len(self._deleted_data_files) > 0: + specs = self._transaction.table_metadata.specs() + for data_file in self._deleted_data_files: + ssc.remove_file( + data_file=data_file, + partition_spec=specs[data_file.spec_id], + schema=self._transaction.table_metadata.schema(), + ) + previous_snapshot = ( self._transaction.table_metadata.snapshot_by_id(self._parent_snapshot_id) if self._parent_snapshot_id is not None @@ -3119,11 +3213,176 @@ def _commit(self) -> UpdatesAndRequirements: snapshot_id=self._snapshot_id, parent_snapshot_id=self._parent_snapshot_id, ref_name="main", type="branch" ), ), - (AssertRefSnapshotId(snapshot_id=self._parent_snapshot_id, ref="main"),), + (AssertRefSnapshotId(snapshot_id=self._transaction.table_metadata.current_snapshot_id, ref="main"),), ) + @property + def snapshot_id(self) -> int: + return self._snapshot_id + + def spec(self, spec_id: int) -> PartitionSpec: + return self._transaction.table_metadata.specs()[spec_id] -class FastAppendFiles(_MergingSnapshotProducer): + def new_manifest_writer(self, spec: PartitionSpec) -> ManifestWriter: + return write_manifest( + 
format_version=self._transaction.table_metadata.format_version, + spec=spec, + schema=self._transaction.table_metadata.schema(), + output_file=self.new_manifest_output(), + snapshot_id=self._snapshot_id, + ) + + def new_manifest_output(self) -> OutputFile: + return self._io.new_output( + _new_manifest_path( + location=self._transaction.table_metadata.location, + num=next(self._manifest_num_counter), + commit_uuid=self.commit_uuid, + ) + ) + + def fetch_manifest_entry(self, manifest: ManifestFile, discard_deleted: bool = True) -> List[ManifestEntry]: + return manifest.fetch_manifest_entry(io=self._io, discard_deleted=discard_deleted) + + +class DeleteFiles(_SnapshotProducer["DeleteFiles"]): + """Will delete manifest entries from the current snapshot based on the predicate. + + This will produce a DELETE snapshot: + Data files were removed and their contents logically deleted and/or delete + files were added to delete rows. + + From the specification + """ + + _predicate: BooleanExpression + + def __init__( + self, + operation: Operation, + transaction: Transaction, + io: FileIO, + commit_uuid: Optional[uuid.UUID] = None, + snapshot_properties: Dict[str, str] = EMPTY_DICT, + ): + super().__init__(operation, transaction, io, commit_uuid, snapshot_properties) + self._predicate = AlwaysFalse() + + def _commit(self) -> UpdatesAndRequirements: + # Only produce a commit when there is something to delete + if self.files_affected: + return super()._commit() + else: + return (), () + + def _build_partition_projection(self, spec_id: int) -> BooleanExpression: + schema = self._transaction.table_metadata.schema() + spec = self._transaction.table_metadata.specs()[spec_id] + project = inclusive_projection(schema, spec) + return project(self._predicate) + + @cached_property + def partition_filters(self) -> KeyDefaultDict[int, BooleanExpression]: + return KeyDefaultDict(self._build_partition_projection) + + def _build_manifest_evaluator(self, spec_id: int) -> Callable[[ManifestFile], bool]: + schema = self._transaction.table_metadata.schema() + spec = self._transaction.table_metadata.specs()[spec_id] + return manifest_evaluator(spec, schema, self.partition_filters[spec_id], case_sensitive=True) + + def delete_by_predicate(self, predicate: BooleanExpression) -> None: + self._predicate = Or(self._predicate, predicate) + + @cached_property + def _compute_deletes(self) -> Tuple[List[ManifestFile], List[ManifestEntry], bool]: + """Computes all the delete operation and cache it when nothing changes. + + Returns: + - List of existing manifests that are not affected by the delete operation. + - The manifest-entries that are deleted based on the metadata. + - Flag indicating that rewrites of data-files are needed. 
+ """ + schema = self._transaction.table_metadata.schema() + + def _copy_with_new_status(entry: ManifestEntry, status: ManifestEntryStatus) -> ManifestEntry: + return ManifestEntry( + status=status, + snapshot_id=entry.snapshot_id, + sequence_number=entry.sequence_number, + file_sequence_number=entry.file_sequence_number, + data_file=entry.data_file, + ) + + manifest_evaluators: Dict[int, Callable[[ManifestFile], bool]] = KeyDefaultDict(self._build_manifest_evaluator) + strict_metrics_evaluator = _StrictMetricsEvaluator(schema, self._predicate, case_sensitive=True).eval + inclusive_metrics_evaluator = _InclusiveMetricsEvaluator(schema, self._predicate, case_sensitive=True).eval + + existing_manifests = [] + total_deleted_entries = [] + partial_rewrites_needed = False + self._deleted_data_files = set() + if snapshot := self._transaction.table_metadata.current_snapshot(): + for manifest_file in snapshot.manifests(io=self._io): + if manifest_file.content == ManifestContent.DATA: + if not manifest_evaluators[manifest_file.partition_spec_id](manifest_file): + # If the manifest isn't relevant, we can just keep it in the manifest-list + existing_manifests.append(manifest_file) + else: + # It is relevant, let's check out the content + deleted_entries = [] + existing_entries = [] + for entry in manifest_file.fetch_manifest_entry(io=self._io, discard_deleted=True): + if strict_metrics_evaluator(entry.data_file) == ROWS_MUST_MATCH: + deleted_entries.append(_copy_with_new_status(entry, ManifestEntryStatus.DELETED)) + self._deleted_data_files.add(entry.data_file) + elif inclusive_metrics_evaluator(entry.data_file) == ROWS_CANNOT_MATCH: + existing_entries.append(_copy_with_new_status(entry, ManifestEntryStatus.EXISTING)) + else: + # Based on the metadata, it is unsure to say if the file can be deleted + partial_rewrites_needed = True + + if len(deleted_entries) > 0: + total_deleted_entries += deleted_entries + + # Rewrite the manifest + if len(existing_entries) > 0: + with write_manifest( + format_version=self._transaction.table_metadata.format_version, + spec=self._transaction.table_metadata.specs()[manifest_file.partition_spec_id], + schema=self._transaction.table_metadata.schema(), + output_file=self.new_manifest_output(), + snapshot_id=self._snapshot_id, + ) as writer: + for existing_entry in existing_entries: + writer.add_entry(existing_entry) + existing_manifests.append(writer.to_manifest_file()) + # else: + # deleted_manifests.append() + else: + existing_manifests.append(manifest_file) + else: + existing_manifests.append(manifest_file) + + return existing_manifests, total_deleted_entries, partial_rewrites_needed + + def _existing_manifests(self) -> List[ManifestFile]: + return self._compute_deletes[0] + + def _deleted_entries(self) -> List[ManifestEntry]: + return self._compute_deletes[1] + + @property + def rewrites_needed(self) -> bool: + """Indicate if data files need to be rewritten.""" + return self._compute_deletes[2] + + @property + def files_affected(self) -> bool: + """Indicate if any manifest-entries can be dropped.""" + return len(self._deleted_entries()) > 0 + + +class FastAppendFiles(_SnapshotProducer["FastAppendFiles"]): def _existing_manifests(self) -> List[ManifestFile]: """To determine if there are any existing manifest files. @@ -3152,14 +3411,97 @@ def _deleted_entries(self) -> List[ManifestEntry]: return [] -class OverwriteFiles(_MergingSnapshotProducer): - def _existing_manifests(self) -> List[ManifestFile]: - """To determine if there are any existing manifest files. 
+class MergeAppendFiles(FastAppendFiles): + _target_size_bytes: int + _min_count_to_merge: int + _merge_enabled: bool - In the of a full overwrite, all the previous manifests are - considered deleted. + def __init__( + self, + operation: Operation, + transaction: Transaction, + io: FileIO, + commit_uuid: Optional[uuid.UUID] = None, + snapshot_properties: Dict[str, str] = EMPTY_DICT, + ) -> None: + super().__init__(operation, transaction, io, commit_uuid, snapshot_properties) + self._target_size_bytes = PropertyUtil.property_as_int( + self._transaction.table_metadata.properties, + TableProperties.MANIFEST_TARGET_SIZE_BYTES, + TableProperties.MANIFEST_TARGET_SIZE_BYTES_DEFAULT, + ) # type: ignore + self._min_count_to_merge = PropertyUtil.property_as_int( + self._transaction.table_metadata.properties, + TableProperties.MANIFEST_MIN_MERGE_COUNT, + TableProperties.MANIFEST_MIN_MERGE_COUNT_DEFAULT, + ) # type: ignore + self._merge_enabled = PropertyUtil.property_as_bool( + self._transaction.table_metadata.properties, + TableProperties.MANIFEST_MERGE_ENABLED, + TableProperties.MANIFEST_MERGE_ENABLED_DEFAULT, + ) + + def _process_manifests(self, manifests: List[ManifestFile]) -> List[ManifestFile]: + """To perform any post-processing on the manifests before writing them to the new snapshot. + + In MergeAppendFiles, we merge manifests based on the target size and the minimum count to merge + if automatic merge is enabled. """ - return [] + unmerged_data_manifests = [manifest for manifest in manifests if manifest.content == ManifestContent.DATA] + unmerged_deletes_manifests = [manifest for manifest in manifests if manifest.content == ManifestContent.DELETES] + + data_manifest_merge_manager = _ManifestMergeManager( + target_size_bytes=self._target_size_bytes, + min_count_to_merge=self._min_count_to_merge, + merge_enabled=self._merge_enabled, + snapshot_producer=self, + ) + + return data_manifest_merge_manager.merge_manifests(unmerged_data_manifests) + unmerged_deletes_manifests + + +class OverwriteFiles(_SnapshotProducer["OverwriteFiles"]): + """Overwrites data from the table. This will produce an OVERWRITE snapshot. + + Data and delete files were added and removed in a logical overwrite operation. 
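A hypothetical way to opt a table into the merge-append path via the `commit.*` properties above; `catalog`, `schema`, and `chunks` are assumed to exist already.

```python
tbl = catalog.create_table(
    "analytics.events_merged",   # hypothetical identifier
    schema=schema,
    properties={
        "commit.manifest-merge.enabled": "true",
        "commit.manifest.min-count-to-merge": "2",
        "commit.manifest.target-size-bytes": str(8 * 1024 * 1024),
    },
)

for chunk in chunks:   # several small appends produce several small manifests...
    tbl.append(chunk)  # ...which merge_append() packs back together on commit
```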
+ """ + + def _existing_manifests(self) -> List[ManifestFile]: + """Determine if there are any existing manifest files.""" + existing_files = [] + + if snapshot := self._transaction.table_metadata.current_snapshot(): + for manifest_file in snapshot.manifests(io=self._io): + entries = manifest_file.fetch_manifest_entry(io=self._io, discard_deleted=True) + found_deleted_data_files = [entry.data_file for entry in entries if entry.data_file in self._deleted_data_files] + + if len(found_deleted_data_files) == 0: + existing_files.append(manifest_file) + else: + # We have to rewrite the manifest file without the deleted data files + if any(entry.data_file not in found_deleted_data_files for entry in entries): + with write_manifest( + format_version=self._transaction.table_metadata.format_version, + spec=self._transaction.table_metadata.spec(), + schema=self._transaction.table_metadata.schema(), + output_file=self.new_manifest_output(), + snapshot_id=self._snapshot_id, + ) as writer: + [ + writer.add_entry( + ManifestEntry( + status=ManifestEntryStatus.EXISTING, + snapshot_id=entry.snapshot_id, + sequence_number=entry.sequence_number, + file_sequence_number=entry.file_sequence_number, + data_file=entry.data_file, + ) + ) + for entry in entries + if entry.data_file not in found_deleted_data_files + ] + existing_files.append(writer.to_manifest_file()) + return existing_files def _deleted_entries(self) -> List[ManifestEntry]: """To determine if we need to record any deleted entries. @@ -3181,12 +3523,12 @@ def _get_entries(manifest: ManifestFile) -> List[ManifestEntry]: ManifestEntry( status=ManifestEntryStatus.DELETED, snapshot_id=entry.snapshot_id, - data_sequence_number=entry.data_sequence_number, + sequence_number=entry.sequence_number, file_sequence_number=entry.file_sequence_number, data_file=entry.data_file, ) for entry in manifest.fetch_manifest_entry(self._io, discard_deleted=True) - if entry.data_file.content == DataFileContent.DATA + if entry.data_file.content == DataFileContent.DATA and entry.data_file in self._deleted_data_files ] list_of_entries = executor.map(_get_entries, previous_snapshot.manifests(self._io)) @@ -3200,7 +3542,7 @@ class UpdateSnapshot: _io: FileIO _snapshot_properties: Dict[str, str] - def __init__(self, transaction: Transaction, io: FileIO, snapshot_properties: Dict[str, str]) -> None: + def __init__(self, transaction: Transaction, io: FileIO, snapshot_properties: Dict[str, str] = EMPTY_DICT) -> None: self._transaction = transaction self._io = io self._snapshot_properties = snapshot_properties @@ -3210,8 +3552,14 @@ def fast_append(self) -> FastAppendFiles: operation=Operation.APPEND, transaction=self._transaction, io=self._io, snapshot_properties=self._snapshot_properties ) - def overwrite(self) -> OverwriteFiles: + def merge_append(self) -> MergeAppendFiles: + return MergeAppendFiles( + operation=Operation.APPEND, transaction=self._transaction, io=self._io, snapshot_properties=self._snapshot_properties + ) + + def overwrite(self, commit_uuid: Optional[uuid.UUID] = None) -> OverwriteFiles: return OverwriteFiles( + commit_uuid=commit_uuid, operation=Operation.OVERWRITE if self._transaction.table_metadata.current_snapshot() is not None else Operation.APPEND, @@ -3220,6 +3568,14 @@ def overwrite(self) -> OverwriteFiles: snapshot_properties=self._snapshot_properties, ) + def delete(self) -> DeleteFiles: + return DeleteFiles( + operation=Operation.DELETE, + transaction=self._transaction, + io=self._io, + snapshot_properties=self._snapshot_properties, + ) + class 
UpdateSpec(UpdateTableMetadata["UpdateSpec"]): _transaction: Transaction @@ -3615,7 +3971,7 @@ def _readable_metrics_struct(bound_type: PrimitiveType) -> pa.StructType: entries.append({ "status": entry.status.value, "snapshot_id": entry.snapshot_id, - "sequence_number": entry.data_sequence_number, + "sequence_number": entry.sequence_number, "file_sequence_number": entry.file_sequence_number, "data_file": { "content": entry.data_file.content, @@ -4011,100 +4367,82 @@ def _readable_metrics_struct(bound_type: PrimitiveType) -> pa.StructType: ) -@dataclass(frozen=True) -class TablePartition: - partition_key: PartitionKey - arrow_table_partition: pa.Table - - -def _get_table_partitions( - arrow_table: pa.Table, - partition_spec: PartitionSpec, - schema: Schema, - slice_instructions: list[dict[str, Any]], -) -> list[TablePartition]: - sorted_slice_instructions = sorted(slice_instructions, key=lambda x: x["offset"]) - - partition_fields = partition_spec.fields - - offsets = [inst["offset"] for inst in sorted_slice_instructions] - projected_and_filtered = { - partition_field.source_id: arrow_table[schema.find_field(name_or_id=partition_field.source_id).name] - .take(offsets) - .to_pylist() - for partition_field in partition_fields - } +class _ManifestMergeManager(Generic[U]): + _target_size_bytes: int + _min_count_to_merge: int + _merge_enabled: bool + _snapshot_producer: _SnapshotProducer[U] - table_partitions = [] - for idx, inst in enumerate(sorted_slice_instructions): - partition_slice = arrow_table.slice(**inst) - fieldvalues = [ - PartitionFieldValue(partition_field, projected_and_filtered[partition_field.source_id][idx]) - for partition_field in partition_fields - ] - partition_key = PartitionKey(raw_partition_field_values=fieldvalues, partition_spec=partition_spec, schema=schema) - table_partitions.append(TablePartition(partition_key=partition_key, arrow_table_partition=partition_slice)) - return table_partitions - - -def _determine_partitions(spec: PartitionSpec, schema: Schema, arrow_table: pa.Table) -> List[TablePartition]: - """Based on the iceberg table partition spec, slice the arrow table into partitions with their keys. - - Example: - Input: - An arrow table with partition key of ['n_legs', 'year'] and with data of - {'year': [2020, 2022, 2022, 2021, 2022, 2022, 2022, 2019, 2021], - 'n_legs': [2, 2, 2, 4, 4, 4, 4, 5, 100], - 'animal': ["Flamingo", "Parrot", "Parrot", "Dog", "Horse", "Horse", "Horse","Brittle stars", "Centipede"]}. - The algrithm: - Firstly we group the rows into partitions by sorting with sort order [('n_legs', 'descending'), ('year', 'descending')] - and null_placement of "at_end". - This gives the same table as raw input. - Then we sort_indices using reverse order of [('n_legs', 'descending'), ('year', 'descending')] - and null_placement : "at_start". - This gives: - [8, 7, 4, 5, 6, 3, 1, 2, 0] - Based on this we get partition groups of indices: - [{'offset': 8, 'length': 1}, {'offset': 7, 'length': 1}, {'offset': 4, 'length': 3}, {'offset': 3, 'length': 1}, {'offset': 1, 'length': 2}, {'offset': 0, 'length': 1}] - We then retrieve the partition keys by offsets. - And slice the arrow table by offsets and lengths of each partition. 
- """ - import pyarrow as pa + def __init__( + self, target_size_bytes: int, min_count_to_merge: int, merge_enabled: bool, snapshot_producer: _SnapshotProducer[U] + ) -> None: + self._target_size_bytes = target_size_bytes + self._min_count_to_merge = min_count_to_merge + self._merge_enabled = merge_enabled + self._snapshot_producer = snapshot_producer + + def _group_by_spec(self, manifests: List[ManifestFile]) -> Dict[int, List[ManifestFile]]: + groups = defaultdict(list) + for manifest in manifests: + groups[manifest.partition_spec_id].append(manifest) + return groups + + def _create_manifest(self, spec_id: int, manifest_bin: List[ManifestFile]) -> ManifestFile: + with self._snapshot_producer.new_manifest_writer(spec=self._snapshot_producer.spec(spec_id)) as writer: + for manifest in manifest_bin: + for entry in self._snapshot_producer.fetch_manifest_entry(manifest=manifest, discard_deleted=False): + if entry.status == ManifestEntryStatus.DELETED and entry.snapshot_id == self._snapshot_producer.snapshot_id: + # only files deleted by this snapshot should be added to the new manifest + writer.delete(entry) + elif entry.status == ManifestEntryStatus.ADDED and entry.snapshot_id == self._snapshot_producer.snapshot_id: + # added entries from this snapshot are still added, otherwise they should be existing + writer.add(entry) + elif entry.status != ManifestEntryStatus.DELETED: + # add all non-deleted files from the old manifest as existing files + writer.existing(entry) + + return writer.to_manifest_file() + + def _merge_group(self, first_manifest: ManifestFile, spec_id: int, manifests: List[ManifestFile]) -> List[ManifestFile]: + packer: ListPacker[ManifestFile] = ListPacker(target_weight=self._target_size_bytes, lookback=1, largest_bin_first=False) + bins: List[List[ManifestFile]] = packer.pack_end(manifests, lambda m: m.manifest_length) + + def merge_bin(manifest_bin: List[ManifestFile]) -> List[ManifestFile]: + output_manifests = [] + if len(manifest_bin) == 1: + output_manifests.append(manifest_bin[0]) + elif first_manifest in manifest_bin and len(manifest_bin) < self._min_count_to_merge: + # if the bin has the first manifest (the new data files or an appended manifest file) then only + # merge it if the number of manifests is above the minimum count. this is applied only to bins + # with an in-memory manifest so that large manifests don't prevent merging older groups. 
+ output_manifests.extend(manifest_bin) + else: + output_manifests.append(self._create_manifest(spec_id, manifest_bin)) - partition_columns: List[Tuple[PartitionField, NestedField]] = [ - (partition_field, schema.find_field(partition_field.source_id)) for partition_field in spec.fields - ] - partition_values_table = pa.table({ - str(partition.field_id): partition.transform.pyarrow_transform(field.field_type)(arrow_table[field.name]) - for partition, field in partition_columns - }) - - # Sort by partitions - sort_indices = pa.compute.sort_indices( - partition_values_table, - sort_keys=[(col, "ascending") for col in partition_values_table.column_names], - null_placement="at_end", - ).to_pylist() - arrow_table = arrow_table.take(sort_indices) - - # Get slice_instructions to group by partitions - partition_values_table = partition_values_table.take(sort_indices) - reversed_indices = pa.compute.sort_indices( - partition_values_table, - sort_keys=[(col, "descending") for col in partition_values_table.column_names], - null_placement="at_start", - ).to_pylist() - slice_instructions: List[Dict[str, Any]] = [] - last = len(reversed_indices) - reversed_indices_size = len(reversed_indices) - ptr = 0 - while ptr < reversed_indices_size: - group_size = last - reversed_indices[ptr] - offset = reversed_indices[ptr] - slice_instructions.append({"offset": offset, "length": group_size}) - last = reversed_indices[ptr] - ptr = ptr + group_size - - table_partitions: List[TablePartition] = _get_table_partitions(arrow_table, spec, schema, slice_instructions) - - return table_partitions + return output_manifests + + executor = ExecutorFactory.get_or_create() + futures = [executor.submit(merge_bin, b) for b in bins] + + # for consistent ordering, we need to maintain future order + futures_index = {f: i for i, f in enumerate(futures)} + completed_futures: SortedList[Future[List[ManifestFile]]] = SortedList(iterable=[], key=lambda f: futures_index[f]) + for future in concurrent.futures.as_completed(futures): + completed_futures.add(future) + + bin_results: List[List[ManifestFile]] = [f.result() for f in completed_futures if f.result()] + + return [manifest for bin_result in bin_results for manifest in bin_result] + + def merge_manifests(self, manifests: List[ManifestFile]) -> List[ManifestFile]: + if not self._merge_enabled or len(manifests) == 0: + return manifests + + first_manifest = manifests[0] + groups = self._group_by_spec(manifests) + + merged_manifests = [] + for spec_id in reversed(groups.keys()): + merged_manifests.extend(self._merge_group(first_manifest, spec_id, groups[spec_id])) + + return merged_manifests diff --git a/pyiceberg/table/name_mapping.py b/pyiceberg/table/name_mapping.py index 5a4e769003..cb9f72bf97 100644 --- a/pyiceberg/table/name_mapping.py +++ b/pyiceberg/table/name_mapping.py @@ -37,7 +37,7 @@ class MappedField(IcebergBaseModel): field_id: int = Field(alias="field-id") - names: List[str] = conlist(str, min_length=1) + names: List[str] = conlist(str) fields: List[MappedField] = Field(default_factory=list) @field_validator("fields", mode="before") @@ -45,18 +45,6 @@ class MappedField(IcebergBaseModel): def convert_null_to_empty_List(cls, v: Any) -> Any: return v or [] - @field_validator("names", mode="after") - @classmethod - def check_at_least_one(cls, v: List[str]) -> Any: - """ - Conlist constraint does not seem to be validating the class on instantiation. - - Adding a custom validator to enforce min_length=1 constraint. 
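With the relaxed constraint, a mapped field carrying an empty `names` list now validates, as the spec allows; a quick sketch against the modified model:

```python
from pyiceberg.table.name_mapping import MappedField

# Constructed via the serialized alias to stay close to the on-disk JSON form.
field = MappedField(**{"field-id": 1, "names": []})
print(field.model_dump_json())
```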
- """ - if len(v) < 1: - raise ValueError("At least one mapped name must be provided for the field") - return v - @model_serializer def ser_model(self) -> Dict[str, Any]: """Set custom serializer to leave out the field when it is empty.""" diff --git a/pyiceberg/table/snapshots.py b/pyiceberg/table/snapshots.py index 842d42522a..1ccb079922 100644 --- a/pyiceberg/table/snapshots.py +++ b/pyiceberg/table/snapshots.py @@ -352,7 +352,7 @@ def get_prop(prop: str) -> int: def update_snapshot_summaries( summary: Summary, previous_summary: Optional[Mapping[str, str]] = None, truncate_full_table: bool = False ) -> Summary: - if summary.operation not in {Operation.APPEND, Operation.OVERWRITE}: + if summary.operation not in {Operation.APPEND, Operation.OVERWRITE, Operation.DELETE}: raise ValueError(f"Operation not implemented: {summary.operation}") if truncate_full_table and summary.operation == Operation.OVERWRITE and previous_summary is not None: diff --git a/pyiceberg/types.py b/pyiceberg/types.py index cd662c7387..97ddea0e57 100644 --- a/pyiceberg/types.py +++ b/pyiceberg/types.py @@ -67,7 +67,7 @@ def transform_dict_value_to_str(dict: Dict[str, Any]) -> Dict[str, str]: for key, value in dict.items(): if value is None: raise ValueError(f"None type is not a supported value in properties: {key}") - return {k: str(v) for k, v in dict.items()} + return {k: str(v).lower() if isinstance(v, bool) else str(v) for k, v in dict.items()} def _parse_decimal_type(decimal: Any) -> Tuple[int, int]: diff --git a/pyiceberg/utils/bin_packing.py b/pyiceberg/utils/bin_packing.py index ddebde13e2..0291619685 100644 --- a/pyiceberg/utils/bin_packing.py +++ b/pyiceberg/utils/bin_packing.py @@ -104,3 +104,29 @@ def remove_bin(self) -> Bin[T]: return bin_ else: return self.bins.pop(0) + + +class ListPacker(Generic[T]): + _target_weight: int + _lookback: int + _largest_bin_first: bool + + def __init__(self, target_weight: int, lookback: int, largest_bin_first: bool) -> None: + self._target_weight = target_weight + self._lookback = lookback + self._largest_bin_first = largest_bin_first + + def pack(self, items: List[T], weight_func: Callable[[T], int]) -> List[List[T]]: + return list( + PackingIterator( + items=items, + target_weight=self._target_weight, + lookback=self._lookback, + weight_func=weight_func, + largest_bin_first=self._largest_bin_first, + ) + ) + + def pack_end(self, items: List[T], weight_func: Callable[[T], int]) -> List[List[T]]: + packed = self.pack(items=list(reversed(items)), weight_func=weight_func) + return [list(reversed(bin_items)) for bin_items in reversed(packed)] diff --git a/pyproject.toml b/pyproject.toml index fe8fe4ed0a..c8cfff5093 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -78,7 +78,7 @@ pytest = "7.4.4" pytest-checkdocs = "2.10.1" pytest-lazy-fixture = "0.6.3" pre-commit = "3.5.0" -fastavro = "1.9.4" +fastavro = "1.9.5" coverage = { version = "^7.4.2", extras = ["toml"] } requests-mock = "1.12.1" moto = { version = "^5.0.2", extras = ["server"] } diff --git a/tests/avro/test_file.py b/tests/avro/test_file.py index 4df132304c..981aab2547 100644 --- a/tests/avro/test_file.py +++ b/tests/avro/test_file.py @@ -140,7 +140,7 @@ def test_write_manifest_entry_with_iceberg_read_with_fastavro_v1() -> None: entry = ManifestEntry( status=ManifestEntryStatus.ADDED, snapshot_id=8638475580105682862, - data_sequence_number=0, + sequence_number=0, file_sequence_number=0, data_file=data_file, ) @@ -173,7 +173,7 @@ def test_write_manifest_entry_with_iceberg_read_with_fastavro_v1() -> None: v2_entry 
= todict(entry) # These are not written in V1 - del v2_entry["data_sequence_number"] + del v2_entry["sequence_number"] del v2_entry["file_sequence_number"] del v2_entry["data_file"]["content"] del v2_entry["data_file"]["equality_ids"] @@ -206,7 +206,7 @@ def test_write_manifest_entry_with_iceberg_read_with_fastavro_v2() -> None: entry = ManifestEntry( status=ManifestEntryStatus.ADDED, snapshot_id=8638475580105682862, - data_sequence_number=0, + sequence_number=0, file_sequence_number=0, data_file=data_file, ) @@ -263,7 +263,7 @@ def test_write_manifest_entry_with_fastavro_read_with_iceberg(format_version: in entry = ManifestEntry( status=ManifestEntryStatus.ADDED, snapshot_id=8638475580105682862, - data_sequence_number=0, + sequence_number=0, file_sequence_number=0, data_file=data_file, ) @@ -305,7 +305,7 @@ def test_write_manifest_entry_with_fastavro_read_with_iceberg(format_version: in status=ManifestEntryStatus.ADDED, snapshot_id=8638475580105682862, # Not part of v1 - data_sequence_number=None, + sequence_number=None, file_sequence_number=None, data_file=v1_datafile, ) diff --git a/tests/catalog/integration_test_glue.py b/tests/catalog/integration_test_glue.py index 21c415212a..a5293e38f2 100644 --- a/tests/catalog/integration_test_glue.py +++ b/tests/catalog/integration_test_glue.py @@ -25,7 +25,7 @@ from botocore.exceptions import ClientError from pyiceberg.catalog import Catalog, MetastoreCatalog -from pyiceberg.catalog.glue import GlueCatalog +from pyiceberg.catalog.glue import GLUE_CATALOG_ENDPOINT, GlueCatalog from pyiceberg.exceptions import ( NamespaceAlreadyExistsError, NamespaceNotEmptyError, @@ -33,11 +33,10 @@ NoSuchTableError, TableAlreadyExistsError, ) -from pyiceberg.io.pyarrow import schema_to_pyarrow +from pyiceberg.io.pyarrow import _dataframe_to_data_files, schema_to_pyarrow from pyiceberg.schema import Schema -from pyiceberg.table import _dataframe_to_data_files from pyiceberg.types import IntegerType -from tests.conftest import clean_up, get_bucket_name, get_s3_path +from tests.conftest import clean_up, get_bucket_name, get_glue_endpoint, get_s3_path # The number of tables/databases used in list_table/namespace test LIST_TEST_NUMBER = 2 @@ -52,7 +51,9 @@ def fixture_glue_client() -> boto3.client: @pytest.fixture(name="test_catalog", scope="module") def fixture_test_catalog() -> Generator[Catalog, None, None]: """Configure the pre- and post-setting of aws integration test.""" - test_catalog = GlueCatalog(CATALOG_NAME, warehouse=get_s3_path(get_bucket_name())) + test_catalog = GlueCatalog( + CATALOG_NAME, **{"warehouse": get_s3_path(get_bucket_name()), GLUE_CATALOG_ENDPOINT: get_glue_endpoint()} + ) yield test_catalog clean_up(test_catalog) diff --git a/tests/catalog/test_glue.py b/tests/catalog/test_glue.py index 6b57f1dfe6..c4afa50c52 100644 --- a/tests/catalog/test_glue.py +++ b/tests/catalog/test_glue.py @@ -862,3 +862,13 @@ def test_register_table_with_given_location( table = test_catalog.register_table(identifier, location) assert table.identifier == (catalog_name,) + identifier assert test_catalog.table_exists(identifier) is True + + +@mock_aws +def test_glue_endpoint_override(_bucket_initialize: None, moto_endpoint_url: str, database_name: str) -> None: + catalog_name = "glue" + test_endpoint = "https://test-endpoint" + test_catalog = GlueCatalog( + catalog_name, **{"s3.endpoint": moto_endpoint_url, "warehouse": f"s3://{BUCKET_NAME}", "glue.endpoint": test_endpoint} + ) + assert test_catalog.glue.meta.endpoint_url == test_endpoint diff --git 
a/tests/catalog/test_sql.py b/tests/catalog/test_sql.py index bb8fcea3ae..f887b1ea3b 100644 --- a/tests/catalog/test_sql.py +++ b/tests/catalog/test_sql.py @@ -39,10 +39,9 @@ TableAlreadyExistsError, ) from pyiceberg.io import FSSPEC_FILE_IO, PY_IO_IMPL -from pyiceberg.io.pyarrow import schema_to_pyarrow +from pyiceberg.io.pyarrow import _dataframe_to_data_files, schema_to_pyarrow from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC from pyiceberg.schema import Schema -from pyiceberg.table import _dataframe_to_data_files from pyiceberg.table.snapshots import Operation from pyiceberg.table.sorting import ( NullOrder, @@ -1550,3 +1549,35 @@ def test_table_exists(catalog: SqlCatalog, table_schema_simple: Schema, table_id # Act and Assert for a non-existing table assert catalog.table_exists(("non", "exist")) is False + + +@pytest.mark.parametrize( + "catalog", + [ + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), + ], +) +@pytest.mark.parametrize("format_version", [1, 2]) +def test_merge_manifests_local_file_system(catalog: SqlCatalog, arrow_table_with_null: pa.Table, format_version: int) -> None: + # To catch manifest file name collision bug during merge: + # https://github.com/apache/iceberg-python/pull/363#discussion_r1660691918 + catalog.create_namespace_if_not_exists("default") + try: + catalog.drop_table("default.test_merge_manifest") + except NoSuchTableError: + pass + tbl = catalog.create_table( + "default.test_merge_manifest", + arrow_table_with_null.schema, + properties={ + "commit.manifest-merge.enabled": "true", + "commit.manifest.min-count-to-merge": "2", + "format-version": format_version, + }, + ) + + for _ in range(5): + tbl.append(arrow_table_with_null) + + assert len(tbl.scan().to_arrow()) == 5 * len(arrow_table_with_null) diff --git a/tests/cli/test_console.py b/tests/cli/test_console.py index 92a7f80c7d..e55ff9a9ad 100644 --- a/tests/cli/test_console.py +++ b/tests/cli/test_console.py @@ -83,7 +83,7 @@ def mock_datetime_now(monkeypatch: pytest.MonkeyPatch) -> None: NestedField(3, "z", LongType(), required=True), ) TEST_TABLE_PARTITION_SPEC = PartitionSpec(PartitionField(name="x", transform=IdentityTransform(), source_id=1, field_id=1000)) -TEST_TABLE_PROPERTIES = {"read.split.target.size": "134217728"} +TEST_TABLE_PROPERTIES = {"read.split.target.size": "134217728", "write.parquet.bloom-filter-enabled.column.x": True} TEST_TABLE_UUID = uuid.UUID("d20125c8-7284-442c-9aea-15fee620737c") TEST_TIMESTAMP = 1602638573874 MOCK_ENVIRONMENT = {"PYICEBERG_CATALOG__PRODUCTION__URI": "test://doesnotexist"} @@ -367,7 +367,10 @@ def test_properties_get_table(catalog: InMemoryCatalog) -> None: runner = CliRunner() result = runner.invoke(run, ["properties", "get", "table", "default.my_table"]) assert result.exit_code == 0 - assert result.output == "read.split.target.size 134217728\n" + assert ( + result.output + == "read.split.target.size 134217728\nwrite.parquet.bloom-filter-enabled.column.x true \n" + ) def test_properties_get_table_specific_property(catalog: InMemoryCatalog) -> None: @@ -763,7 +766,7 @@ def test_json_properties_get_table(catalog: InMemoryCatalog) -> None: runner = CliRunner() result = runner.invoke(run, ["--output=json", "properties", "get", "table", "default.my_table"]) assert result.exit_code == 0 - assert result.output == """{"read.split.target.size": "134217728"}\n""" + assert result.output == """{"read.split.target.size": "134217728", "write.parquet.bloom-filter-enabled.column.x": "true"}\n""" def 
test_json_properties_get_table_specific_property(catalog: InMemoryCatalog) -> None: diff --git a/tests/conftest.py b/tests/conftest.py index d200f3ab3c..7f9a2bcfa8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2043,6 +2043,11 @@ def get_bucket_name() -> str: return bucket_name +def get_glue_endpoint() -> Optional[str]: + """Set the optional environment variable AWS_TEST_GLUE_ENDPOINT for a glue endpoint to test.""" + return os.getenv("AWS_TEST_GLUE_ENDPOINT") + + def get_s3_path(bucket_name: str, database_name: Optional[str] = None, table_name: Optional[str] = None) -> str: result_path = f"s3://{bucket_name}" if database_name is not None: @@ -2299,7 +2304,37 @@ def arrow_table_with_null(pa_schema: "pa.Schema") -> "pa.Table": """Pyarrow table with all kinds of columns.""" import pyarrow as pa - return pa.Table.from_pydict(TEST_DATA_WITH_NULL, schema=pa_schema) + return pa.Table.from_pydict( + { + "bool": [False, None, True], + "string": ["a", None, "z"], + # Go over the 16 bytes to kick in truncation + "string_long": ["a" * 22, None, "z" * 22], + "int": [1, None, 9], + "long": [1, None, 9], + "float": [0.0, None, 0.9], + "double": [0.0, None, 0.9], + # 'time': [1_000_000, None, 3_000_000], # Example times: 1s, none, and 3s past midnight #Spark does not support time fields + "timestamp": [datetime(2023, 1, 1, 19, 25, 00), None, datetime(2023, 3, 1, 19, 25, 00)], + "timestamptz": [ + datetime(2023, 1, 1, 19, 25, 00, tzinfo=timezone.utc), + None, + datetime(2023, 3, 1, 19, 25, 00, tzinfo=timezone.utc), + ], + "date": [date(2023, 1, 1), None, date(2023, 3, 1)], + # Not supported by Spark + # 'time': [time(1, 22, 0), None, time(19, 25, 0)], + # Not natively supported by Arrow + # 'uuid': [uuid.UUID('00000000-0000-0000-0000-000000000000').bytes, None, uuid.UUID('11111111-1111-1111-1111-111111111111').bytes], + "binary": [b"\01", None, b"\22"], + "fixed": [ + uuid.UUID("00000000-0000-0000-0000-000000000000").bytes, + None, + uuid.UUID("11111111-1111-1111-1111-111111111111").bytes, + ], + }, + schema=pa_schema, + ) @pytest.fixture(scope="session") @@ -2352,10 +2387,181 @@ def arrow_table_date_timestamps() -> "pa.Table": @pytest.fixture(scope="session") -def arrow_table_date_timestamps_schema() -> Schema: - """Pyarrow table Schema with only date, timestamp and timestamptz values.""" +def table_date_timestamps_schema() -> Schema: + """Iceberg table Schema with only date, timestamp and timestamptz values.""" return Schema( NestedField(field_id=1, name="date", field_type=DateType(), required=False), NestedField(field_id=2, name="timestamp", field_type=TimestampType(), required=False), NestedField(field_id=3, name="timestamptz", field_type=TimestamptzType(), required=False), ) + + +@pytest.fixture(scope="session") +def arrow_table_schema_with_all_timestamp_precisions() -> "pa.Schema": + """Pyarrow Schema with all supported timestamp types.""" + import pyarrow as pa + + return pa.schema([ + ("timestamp_s", pa.timestamp(unit="s")), + ("timestamptz_s", pa.timestamp(unit="s", tz="UTC")), + ("timestamp_ms", pa.timestamp(unit="ms")), + ("timestamptz_ms", pa.timestamp(unit="ms", tz="UTC")), + ("timestamp_us", pa.timestamp(unit="us")), + ("timestamptz_us", pa.timestamp(unit="us", tz="UTC")), + ("timestamp_ns", pa.timestamp(unit="ns")), + ("timestamptz_ns", pa.timestamp(unit="ns", tz="UTC")), + ("timestamptz_us_etc_utc", pa.timestamp(unit="us", tz="Etc/UTC")), + ("timestamptz_ns_z", pa.timestamp(unit="ns", tz="Z")), + ("timestamptz_s_0000", pa.timestamp(unit="s", tz="+00:00")), + ]) + + 
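The fixture above enumerates every Arrow timestamp unit together with several spellings of UTC, and the companion fixtures below pair it with an all-microseconds schema. As a minimal, standalone sketch (not code from this patch) of how such mixed precisions can be collapsed to microsecond precision using PyArrow's public cast API; the column name "ts" is invented for the example:

import pyarrow as pa
from datetime import datetime, timezone

# A nanosecond-precision, UTC-zoned column, similar to the fixture entries above.
table = pa.table({
    "ts": pa.array(
        [datetime(2023, 1, 1, 19, 25, tzinfo=timezone.utc), None],
        type=pa.timestamp("ns", tz="UTC"),
    )
})

# Casting the whole table to a microsecond schema normalizes the unit; differing
# zone spellings such as "Etc/UTC", "Z" or "+00:00" can be rewritten the same way.
us_schema = pa.schema([pa.field("ts", pa.timestamp("us", tz="UTC"))])
normalized = table.cast(us_schema)
assert normalized.schema.field("ts").type == pa.timestamp("us", tz="UTC")
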
+@pytest.fixture(scope="session") +def arrow_table_with_all_timestamp_precisions(arrow_table_schema_with_all_timestamp_precisions: "pa.Schema") -> "pa.Table": + """Pyarrow table with all supported timestamp types.""" + import pandas as pd + import pyarrow as pa + + test_data = pd.DataFrame({ + "timestamp_s": [datetime(2023, 1, 1, 19, 25, 00), None, datetime(2023, 3, 1, 19, 25, 00)], + "timestamptz_s": [ + datetime(2023, 1, 1, 19, 25, 00, tzinfo=timezone.utc), + None, + datetime(2023, 3, 1, 19, 25, 00, tzinfo=timezone.utc), + ], + "timestamp_ms": [datetime(2023, 1, 1, 19, 25, 00), None, datetime(2023, 3, 1, 19, 25, 00)], + "timestamptz_ms": [ + datetime(2023, 1, 1, 19, 25, 00, tzinfo=timezone.utc), + None, + datetime(2023, 3, 1, 19, 25, 00, tzinfo=timezone.utc), + ], + "timestamp_us": [datetime(2023, 1, 1, 19, 25, 00), None, datetime(2023, 3, 1, 19, 25, 00)], + "timestamptz_us": [ + datetime(2023, 1, 1, 19, 25, 00, tzinfo=timezone.utc), + None, + datetime(2023, 3, 1, 19, 25, 00, tzinfo=timezone.utc), + ], + "timestamp_ns": [ + pd.Timestamp(year=2024, month=7, day=11, hour=3, minute=30, second=0, microsecond=12, nanosecond=6), + None, + pd.Timestamp(year=2024, month=7, day=11, hour=3, minute=30, second=0, microsecond=12, nanosecond=7), + ], + "timestamptz_ns": [ + datetime(2023, 1, 1, 19, 25, 00, tzinfo=timezone.utc), + None, + datetime(2023, 3, 1, 19, 25, 00, tzinfo=timezone.utc), + ], + "timestamptz_us_etc_utc": [ + datetime(2023, 1, 1, 19, 25, 00, tzinfo=timezone.utc), + None, + datetime(2023, 3, 1, 19, 25, 00, tzinfo=timezone.utc), + ], + "timestamptz_ns_z": [ + pd.Timestamp(year=2024, month=7, day=11, hour=3, minute=30, second=0, microsecond=12, nanosecond=6, tz="UTC"), + None, + pd.Timestamp(year=2024, month=7, day=11, hour=3, minute=30, second=0, microsecond=12, nanosecond=7, tz="UTC"), + ], + "timestamptz_s_0000": [ + datetime(2023, 1, 1, 19, 25, 1, tzinfo=timezone.utc), + None, + datetime(2023, 3, 1, 19, 25, 1, tzinfo=timezone.utc), + ], + }) + return pa.Table.from_pandas(test_data, schema=arrow_table_schema_with_all_timestamp_precisions) + + +@pytest.fixture(scope="session") +def arrow_table_schema_with_all_microseconds_timestamp_precisions() -> "pa.Schema": + """Pyarrow Schema with all microseconds timestamp.""" + import pyarrow as pa + + return pa.schema([ + ("timestamp_s", pa.timestamp(unit="us")), + ("timestamptz_s", pa.timestamp(unit="us", tz="UTC")), + ("timestamp_ms", pa.timestamp(unit="us")), + ("timestamptz_ms", pa.timestamp(unit="us", tz="UTC")), + ("timestamp_us", pa.timestamp(unit="us")), + ("timestamptz_us", pa.timestamp(unit="us", tz="UTC")), + ("timestamp_ns", pa.timestamp(unit="us")), + ("timestamptz_ns", pa.timestamp(unit="us", tz="UTC")), + ("timestamptz_us_etc_utc", pa.timestamp(unit="us", tz="UTC")), + ("timestamptz_ns_z", pa.timestamp(unit="us", tz="UTC")), + ("timestamptz_s_0000", pa.timestamp(unit="us", tz="UTC")), + ]) + + +@pytest.fixture(scope="session") +def table_schema_with_all_microseconds_timestamp_precision() -> Schema: + """Iceberg table Schema with only date, timestamp and timestamptz values.""" + return Schema( + NestedField(field_id=1, name="timestamp_s", field_type=TimestampType(), required=False), + NestedField(field_id=2, name="timestamptz_s", field_type=TimestamptzType(), required=False), + NestedField(field_id=3, name="timestamp_ms", field_type=TimestampType(), required=False), + NestedField(field_id=4, name="timestamptz_ms", field_type=TimestamptzType(), required=False), + NestedField(field_id=5, name="timestamp_us", 
field_type=TimestampType(), required=False), + NestedField(field_id=6, name="timestamptz_us", field_type=TimestamptzType(), required=False), + NestedField(field_id=7, name="timestamp_ns", field_type=TimestampType(), required=False), + NestedField(field_id=8, name="timestamptz_ns", field_type=TimestamptzType(), required=False), + NestedField(field_id=9, name="timestamptz_us_etc_utc", field_type=TimestamptzType(), required=False), + NestedField(field_id=10, name="timestamptz_ns_z", field_type=TimestamptzType(), required=False), + NestedField(field_id=11, name="timestamptz_s_0000", field_type=TimestamptzType(), required=False), + ) + + +@pytest.fixture(scope="session") +def table_schema_with_promoted_types() -> Schema: + """Iceberg table Schema with longs, doubles and uuid in simple and nested types.""" + return Schema( + NestedField(field_id=1, name="long", field_type=LongType(), required=False), + NestedField( + field_id=2, + name="list", + field_type=ListType(element_id=4, element_type=LongType(), element_required=False), + required=True, + ), + NestedField( + field_id=3, + name="map", + field_type=MapType( + key_id=5, + key_type=StringType(), + value_id=6, + value_type=LongType(), + value_required=False, + ), + required=True, + ), + NestedField(field_id=7, name="double", field_type=DoubleType(), required=False), + NestedField(field_id=8, name="uuid", field_type=UUIDType(), required=False), + ) + + +@pytest.fixture(scope="session") +def pyarrow_schema_with_promoted_types() -> "pa.Schema": + """Pyarrow Schema with longs, doubles and uuid in simple and nested types.""" + import pyarrow as pa + + return pa.schema(( + pa.field("long", pa.int32(), nullable=True), # can support upcasting integer to long + pa.field("list", pa.list_(pa.int32()), nullable=False), # can support upcasting integer to long + pa.field("map", pa.map_(pa.string(), pa.int32()), nullable=False), # can support upcasting integer to long + pa.field("double", pa.float32(), nullable=True), # can support upcasting float to double + pa.field("uuid", pa.binary(length=16), nullable=True), # can support upcasting float to double + )) + + +@pytest.fixture(scope="session") +def pyarrow_table_with_promoted_types(pyarrow_schema_with_promoted_types: "pa.Schema") -> "pa.Table": + """Pyarrow table with longs, doubles and uuid in simple and nested types.""" + import pyarrow as pa + + return pa.Table.from_pydict( + { + "long": [1, 9], + "list": [[1, 1], [2, 2]], + "map": [{"a": 1}, {"b": 2}], + "double": [1.1, 9.2], + "uuid": [b"qZx\xefNS@\x89\x9b\xf9:\xd0\xee\x9b\xf5E", b"\x97]\x87T^JDJ\x96\x97\xf4v\xe4\x03\x0c\xde"], + }, + schema=pyarrow_schema_with_promoted_types, + ) diff --git a/tests/integration/test_add_files.py b/tests/integration/test_add_files.py index 84729fcca4..3703a9e0b6 100644 --- a/tests/integration/test_add_files.py +++ b/tests/integration/test_add_files.py @@ -16,17 +16,22 @@ # under the License. 
# pylint:disable=redefined-outer-name +import os +import re from datetime import date -from typing import Iterator, Optional +from typing import Iterator import pyarrow as pa import pyarrow.parquet as pq import pytest from pyspark.sql import SparkSession +from pytest_mock.plugin import MockerFixture from pyiceberg.catalog import Catalog from pyiceberg.exceptions import NoSuchTableError -from pyiceberg.partitioning import PartitionField, PartitionSpec +from pyiceberg.io import FileIO +from pyiceberg.io.pyarrow import _pyarrow_schema_ensure_large_types +from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionField, PartitionSpec from pyiceberg.schema import Schema from pyiceberg.table import Table from pyiceberg.transforms import BucketTransform, IdentityTransform, MonthTransform @@ -34,8 +39,10 @@ BooleanType, DateType, IntegerType, + LongType, NestedField, StringType, + TimestamptzType, ) TABLE_SCHEMA = Schema( @@ -104,23 +111,32 @@ ) +def _write_parquet(io: FileIO, file_path: str, arrow_schema: pa.Schema, arrow_table: pa.Table) -> None: + fo = io.new_output(file_path) + with fo.create(overwrite=True) as fos: + with pq.ParquetWriter(fos, schema=arrow_schema) as writer: + writer.write_table(arrow_table) + + def _create_table( - session_catalog: Catalog, identifier: str, format_version: int, partition_spec: Optional[PartitionSpec] = None + session_catalog: Catalog, + identifier: str, + format_version: int, + partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC, + schema: Schema = TABLE_SCHEMA, ) -> Table: try: session_catalog.drop_table(identifier=identifier) except NoSuchTableError: pass - tbl = session_catalog.create_table( + return session_catalog.create_table( identifier=identifier, - schema=TABLE_SCHEMA, + schema=schema, properties={"format-version": str(format_version)}, - partition_spec=partition_spec if partition_spec else PartitionSpec(), + partition_spec=partition_spec, ) - return tbl - @pytest.fixture(name="format_version", params=[pytest.param(1, id="format_version=1"), pytest.param(2, id="format_version=2")]) def format_version_fixure(request: pytest.FixtureRequest) -> Iterator[int]: @@ -448,3 +464,271 @@ def test_add_files_snapshot_properties(spark: SparkSession, session_catalog: Cat assert "snapshot_prop_a" in summary assert summary["snapshot_prop_a"] == "test_prop_a" + + +@pytest.mark.integration +def test_add_files_fails_on_schema_mismatch(spark: SparkSession, session_catalog: Catalog, format_version: int) -> None: + identifier = f"default.table_schema_mismatch_fails_v{format_version}" + + tbl = _create_table(session_catalog, identifier, format_version) + WRONG_SCHEMA = pa.schema([ + ("foo", pa.bool_()), + ("bar", pa.string()), + ("baz", pa.string()), # should be integer + ("qux", pa.date32()), + ]) + file_path = f"s3://warehouse/default/table_schema_mismatch_fails/v{format_version}/test.parquet" + # write parquet files + fo = tbl.io.new_output(file_path) + with fo.create(overwrite=True) as fos: + with pq.ParquetWriter(fos, schema=WRONG_SCHEMA) as writer: + writer.write_table( + pa.Table.from_pylist( + [ + { + "foo": True, + "bar": "bar_string", + "baz": "123", + "qux": date(2024, 3, 7), + }, + { + "foo": True, + "bar": "bar_string", + "baz": "124", + "qux": date(2024, 3, 7), + }, + ], + schema=WRONG_SCHEMA, + ) + ) + + expected = """Mismatch in fields: +┏━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +┃ ┃ Table field ┃ Dataframe field ┃ +┡━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━┩ +│ ✅ │ 1: foo: optional boolean │ 1: 
foo: optional boolean │ +│ ✅ │ 2: bar: optional string │ 2: bar: optional string │ +│ ❌ │ 3: baz: optional int │ 3: baz: optional string │ +│ ✅ │ 4: qux: optional date │ 4: qux: optional date │ +└────┴──────────────────────────┴──────────────────────────┘ +""" + + with pytest.raises(ValueError, match=expected): + tbl.add_files(file_paths=[file_path]) + + +@pytest.mark.integration +def test_add_files_with_large_and_regular_schema(spark: SparkSession, session_catalog: Catalog, format_version: int) -> None: + identifier = f"default.unpartitioned_with_large_types{format_version}" + + iceberg_schema = Schema(NestedField(1, "foo", StringType(), required=True)) + arrow_schema = pa.schema([ + pa.field("foo", pa.string(), nullable=False), + ]) + arrow_schema_large = pa.schema([ + pa.field("foo", pa.large_string(), nullable=False), + ]) + + tbl = _create_table(session_catalog, identifier, format_version, schema=iceberg_schema) + + file_path = f"s3://warehouse/default/unpartitioned_with_large_types/v{format_version}/test-0.parquet" + _write_parquet( + tbl.io, + file_path, + arrow_schema, + pa.Table.from_pylist( + [ + { + "foo": "normal", + } + ], + schema=arrow_schema, + ), + ) + + tbl.add_files([file_path]) + + table_schema = tbl.scan().to_arrow().schema + assert table_schema == arrow_schema_large + + file_path_large = f"s3://warehouse/default/unpartitioned_with_large_types/v{format_version}/test-1.parquet" + _write_parquet( + tbl.io, + file_path_large, + arrow_schema_large, + pa.Table.from_pylist( + [ + { + "foo": "normal", + } + ], + schema=arrow_schema_large, + ), + ) + + tbl.add_files([file_path_large]) + + table_schema = tbl.scan().to_arrow().schema + assert table_schema == arrow_schema_large + + +@pytest.mark.integration +def test_add_files_with_timestamp_tz_ns_fails(session_catalog: Catalog, format_version: int, mocker: MockerFixture) -> None: + nanoseconds_schema_iceberg = Schema(NestedField(1, "quux", TimestamptzType())) + + nanoseconds_schema = pa.schema([ + ("quux", pa.timestamp("ns", tz="UTC")), + ]) + + arrow_table = pa.Table.from_pylist( + [ + { + "quux": 1615967687249846175, # 2021-03-17 07:54:47.249846159 + } + ], + schema=nanoseconds_schema, + ) + mocker.patch.dict(os.environ, values={"PYICEBERG_DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE": "True"}) + + identifier = f"default.timestamptz_ns_added{format_version}" + tbl = _create_table(session_catalog, identifier, format_version, schema=nanoseconds_schema_iceberg) + + file_path = f"s3://warehouse/default/test_timestamp_tz/v{format_version}/test.parquet" + # write parquet files + fo = tbl.io.new_output(file_path) + with fo.create(overwrite=True) as fos: + with pq.ParquetWriter(fos, schema=nanoseconds_schema) as writer: + writer.write_table(arrow_table) + + # add the parquet files as data files + with pytest.raises( + TypeError, + match=re.escape( + "Iceberg does not yet support 'ns' timestamp precision. Use 'downcast-ns-timestamp-to-us-on-write' configuration property to automatically downcast 'ns' to 'us' on write." 
+ ), + ): + tbl.add_files(file_paths=[file_path]) + + +@pytest.mark.integration +@pytest.mark.parametrize("format_version", [1, 2]) +def test_add_file_with_valid_nullability_diff(spark: SparkSession, session_catalog: Catalog, format_version: int) -> None: + identifier = f"default.test_table_with_valid_nullability_diff{format_version}" + table_schema = Schema( + NestedField(field_id=1, name="long", field_type=LongType(), required=False), + ) + other_schema = pa.schema(( + pa.field("long", pa.int64(), nullable=False), # can support writing required pyarrow field to optional Iceberg field + )) + arrow_table = pa.Table.from_pydict( + { + "long": [1, 9], + }, + schema=other_schema, + ) + tbl = _create_table(session_catalog, identifier, format_version, schema=table_schema) + + file_path = f"s3://warehouse/default/test_add_file_with_valid_nullability_diff/v{format_version}/test.parquet" + # write parquet files + fo = tbl.io.new_output(file_path) + with fo.create(overwrite=True) as fos: + with pq.ParquetWriter(fos, schema=other_schema) as writer: + writer.write_table(arrow_table) + + tbl.add_files(file_paths=[file_path]) + # table's long field should cast to be optional on read + written_arrow_table = tbl.scan().to_arrow() + assert written_arrow_table == arrow_table.cast(pa.schema((pa.field("long", pa.int64(), nullable=True),))) + lhs = spark.table(f"{identifier}").toPandas() + rhs = written_arrow_table.to_pandas() + + for column in written_arrow_table.column_names: + for left, right in zip(lhs[column].to_list(), rhs[column].to_list()): + assert left == right + + +@pytest.mark.integration +@pytest.mark.parametrize("format_version", [1, 2]) +def test_add_files_with_valid_upcast( + spark: SparkSession, + session_catalog: Catalog, + format_version: int, + table_schema_with_promoted_types: Schema, + pyarrow_schema_with_promoted_types: pa.Schema, + pyarrow_table_with_promoted_types: pa.Table, +) -> None: + identifier = f"default.test_table_with_valid_upcast{format_version}" + tbl = _create_table(session_catalog, identifier, format_version, schema=table_schema_with_promoted_types) + + file_path = f"s3://warehouse/default/test_add_files_with_valid_upcast/v{format_version}/test.parquet" + # write parquet files + fo = tbl.io.new_output(file_path) + with fo.create(overwrite=True) as fos: + with pq.ParquetWriter(fos, schema=pyarrow_schema_with_promoted_types) as writer: + writer.write_table(pyarrow_table_with_promoted_types) + + tbl.add_files(file_paths=[file_path]) + # table's long field should cast to long on read + written_arrow_table = tbl.scan().to_arrow() + assert written_arrow_table == pyarrow_table_with_promoted_types.cast( + pa.schema(( + pa.field("long", pa.int64(), nullable=True), + pa.field("list", pa.large_list(pa.int64()), nullable=False), + pa.field("map", pa.map_(pa.large_string(), pa.int64()), nullable=False), + pa.field("double", pa.float64(), nullable=True), + pa.field("uuid", pa.binary(length=16), nullable=True), # can UUID is read as fixed length binary of length 16 + )) + ) + lhs = spark.table(f"{identifier}").toPandas() + rhs = written_arrow_table.to_pandas() + + for column in written_arrow_table.column_names: + for left, right in zip(lhs[column].to_list(), rhs[column].to_list()): + if column == "map": + # Arrow returns a list of tuples, instead of a dict + right = dict(right) + if column == "list": + # Arrow returns an array, convert to list for equality check + left, right = list(left), list(right) + if column == "uuid": + # Spark Iceberg represents UUID as hex string like 
'715a78ef-4e53-4089-9bf9-3ad0ee9bf545' + # whereas PyIceberg represents UUID as bytes on read + left, right = left.replace("-", ""), right.hex() + assert left == right + + +@pytest.mark.integration +@pytest.mark.parametrize("format_version", [1, 2]) +def test_add_files_subset_of_schema(spark: SparkSession, session_catalog: Catalog, format_version: int) -> None: + identifier = f"default.test_table_subset_of_schema{format_version}" + tbl = _create_table(session_catalog, identifier, format_version) + + file_path = f"s3://warehouse/default/test_add_files_subset_of_schema/v{format_version}/test.parquet" + arrow_table_without_some_columns = ARROW_TABLE.combine_chunks().drop(ARROW_TABLE.column_names[0]) + + # write parquet files + fo = tbl.io.new_output(file_path) + with fo.create(overwrite=True) as fos: + with pq.ParquetWriter(fos, schema=arrow_table_without_some_columns.schema) as writer: + writer.write_table(arrow_table_without_some_columns) + + tbl.add_files(file_paths=[file_path]) + written_arrow_table = tbl.scan().to_arrow() + assert tbl.scan().to_arrow() == pa.Table.from_pylist( + [ + { + "foo": None, # Missing column is read as None on read + "bar": "bar_string", + "baz": 123, + "qux": date(2024, 3, 7), + } + ], + schema=_pyarrow_schema_ensure_large_types(ARROW_SCHEMA), + ) + + lhs = spark.table(f"{identifier}").toPandas() + rhs = written_arrow_table.to_pandas() + + for column in written_arrow_table.column_names: + for left, right in zip(lhs[column].to_list(), rhs[column].to_list()): + assert left == right diff --git a/tests/integration/test_deletes.py b/tests/integration/test_deletes.py new file mode 100644 index 0000000000..d8fb01c447 --- /dev/null +++ b/tests/integration/test_deletes.py @@ -0,0 +1,419 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
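The new test module below drives PyIceberg's row-level delete support against Spark-created tables. As a rough sketch of the user-facing API these tests target (the catalog name and table identifier are placeholders, and a configured "default" catalog is assumed):

from pyiceberg.catalog import load_catalog
from pyiceberg.expressions import AlwaysTrue, EqualTo

# Assumes a catalog named "default" is configured (e.g. via ~/.pyiceberg.yaml).
catalog = load_catalog("default")
tbl = catalog.load_table("default.table_partitioned_delete")

# Deleting a whole partition can drop data files outright (a "delete" snapshot),
# while deleting only part of a file rewrites it (an "overwrite" snapshot).
tbl.delete(EqualTo("number_partitioned", 10))

# AlwaysTrue() behaves like a truncate: every live manifest entry is marked DELETED.
tbl.delete(delete_filter=AlwaysTrue())
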
+# pylint:disable=redefined-outer-name +from typing import List + +import pyarrow as pa +import pytest +from pyspark.sql import SparkSession + +from pyiceberg.catalog.rest import RestCatalog +from pyiceberg.exceptions import NoSuchTableError +from pyiceberg.expressions import AlwaysTrue, EqualTo +from pyiceberg.manifest import ManifestEntryStatus +from pyiceberg.schema import Schema +from pyiceberg.table.snapshots import Operation, Summary +from pyiceberg.types import IntegerType, NestedField + + +def run_spark_commands(spark: SparkSession, sqls: List[str]) -> None: + for sql in sqls: + spark.sql(sql) + + +@pytest.mark.integration +@pytest.mark.parametrize("format_version", [1, 2]) +def test_partitioned_table_delete_full_file(spark: SparkSession, session_catalog: RestCatalog, format_version: int) -> None: + identifier = "default.table_partitioned_delete" + + run_spark_commands( + spark, + [ + f"DROP TABLE IF EXISTS {identifier}", + f""" + CREATE TABLE {identifier} ( + number_partitioned int, + number int + ) + USING iceberg + PARTITIONED BY (number_partitioned) + TBLPROPERTIES('format-version' = {format_version}) + """, + f""" + INSERT INTO {identifier} VALUES (10, 20), (10, 30) + """, + f""" + INSERT INTO {identifier} VALUES (11, 20), (11, 30) + """, + ], + ) + + tbl = session_catalog.load_table(identifier) + tbl.delete(EqualTo("number_partitioned", 10)) + + # No overwrite operation + assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()] == ["append", "append", "delete"] + assert tbl.scan().to_arrow().to_pydict() == {"number_partitioned": [11, 11], "number": [20, 30]} + + +@pytest.mark.integration +@pytest.mark.parametrize("format_version", [1, 2]) +def test_partitioned_table_rewrite(spark: SparkSession, session_catalog: RestCatalog, format_version: int) -> None: + identifier = "default.table_partitioned_delete" + + run_spark_commands( + spark, + [ + f"DROP TABLE IF EXISTS {identifier}", + f""" + CREATE TABLE {identifier} ( + number_partitioned int, + number int + ) + USING iceberg + PARTITIONED BY (number_partitioned) + TBLPROPERTIES('format-version' = {format_version}) + """, + f""" + INSERT INTO {identifier} VALUES (10, 20), (10, 30) + """, + f""" + INSERT INTO {identifier} VALUES (11, 20), (11, 30) + """, + ], + ) + + tbl = session_catalog.load_table(identifier) + tbl.delete(EqualTo("number", 20)) + + # We don't delete a whole partition, so there is only a overwrite + assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()] == ["append", "append", "overwrite"] + assert tbl.scan().to_arrow().to_pydict() == {"number_partitioned": [11, 10], "number": [30, 30]} + + +@pytest.mark.integration +@pytest.mark.parametrize("format_version", [1, 2]) +def test_partitioned_table_no_match(spark: SparkSession, session_catalog: RestCatalog, format_version: int) -> None: + identifier = "default.table_partitioned_delete" + + run_spark_commands( + spark, + [ + f"DROP TABLE IF EXISTS {identifier}", + f""" + CREATE TABLE {identifier} ( + number_partitioned int, + number int + ) + USING iceberg + PARTITIONED BY (number_partitioned) + TBLPROPERTIES('format-version' = {format_version}) + """, + f""" + INSERT INTO {identifier} VALUES (10, 20), (10, 30) + """, + ], + ) + + tbl = session_catalog.load_table(identifier) + tbl.delete(EqualTo("number_partitioned", 22)) # Does not affect any data + + assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()] == ["append"] + assert tbl.scan().to_arrow().to_pydict() == {"number_partitioned": [10, 10], "number": [20, 
30]} + + +@pytest.mark.integration +def test_delete_partitioned_table_positional_deletes(spark: SparkSession, session_catalog: RestCatalog) -> None: + identifier = "default.table_partitioned_delete" + + run_spark_commands( + spark, + [ + f"DROP TABLE IF EXISTS {identifier}", + f""" + CREATE TABLE {identifier} ( + number_partitioned int, + number int + ) + USING iceberg + PARTITIONED BY (number_partitioned) + TBLPROPERTIES( + 'format-version' = 2, + 'write.delete.mode'='merge-on-read', + 'write.update.mode'='merge-on-read', + 'write.merge.mode'='merge-on-read' + ) + """, + f""" + INSERT INTO {identifier} VALUES (10, 20), (10, 30), (10, 40) + """, + # Generate a positional delete + f""" + DELETE FROM {identifier} WHERE number = 30 + """, + ], + ) + + tbl = session_catalog.load_table(identifier) + + # Assert that there is just a single Parquet file, that has one merge on read file + files = list(tbl.scan().plan_files()) + assert len(files) == 1 + assert len(files[0].delete_files) == 1 + + # Will rewrite a data file without the positional delete + tbl.delete(EqualTo("number", 40)) + + # One positional delete has been added, but an OVERWRITE status is set + # https://github.com/apache/iceberg/issues/10122 + assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()] == ["append", "overwrite", "overwrite"] + assert tbl.scan().to_arrow().to_pydict() == {"number_partitioned": [10], "number": [20]} + + +@pytest.mark.integration +def test_overwrite_partitioned_table(spark: SparkSession, session_catalog: RestCatalog) -> None: + identifier = "default.table_partitioned_delete" + + run_spark_commands( + spark, + [ + f"DROP TABLE IF EXISTS {identifier}", + f""" + CREATE TABLE {identifier} ( + number_partitioned int, + number int + ) + USING iceberg + PARTITIONED BY (number_partitioned) + TBLPROPERTIES( + 'format-version' = 2, + 'write.delete.mode'='merge-on-read', + 'write.update.mode'='merge-on-read', + 'write.merge.mode'='merge-on-read' + ) + """, + f""" + INSERT INTO {identifier} VALUES (10, 1), (10, 2), (20, 3) + """, + ], + ) + + tbl = session_catalog.load_table(identifier) + + files = list(tbl.scan().plan_files()) + assert len(files) == 2 + + arrow_schema = pa.schema([pa.field("number_partitioned", pa.int32()), pa.field("number", pa.int32())]) + arrow_tbl = pa.Table.from_pylist( + [ + {"number_partitioned": 10, "number": 4}, + {"number_partitioned": 10, "number": 5}, + ], + schema=arrow_schema, + ) + + # Will rewrite a data file without the positional delete + tbl.overwrite(arrow_tbl, "number_partitioned == 10") + + # One positional delete has been added, but an OVERWRITE status is set + # https://github.com/apache/iceberg/issues/10122 + assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()] == ["append", "delete", "append"] + assert tbl.scan().to_arrow().to_pydict() == {"number_partitioned": [10, 10, 20], "number": [4, 5, 3]} + + +@pytest.mark.integration +def test_partitioned_table_positional_deletes_sequence_number(spark: SparkSession, session_catalog: RestCatalog) -> None: + identifier = "default.table_partitioned_delete_sequence_number" + + # This test case is a bit more complex. 
Here we run a MoR delete on a file, we make sure that + # the manifest gets rewritten (but not the data file with a MoR), and check if the delete is still there + # to assure that the sequence numbers are maintained + + run_spark_commands( + spark, + [ + f"DROP TABLE IF EXISTS {identifier}", + f""" + CREATE TABLE {identifier} ( + number_partitioned int, + number int + ) + USING iceberg + PARTITIONED BY (number_partitioned) + TBLPROPERTIES( + 'format-version' = 2, + 'write.delete.mode'='merge-on-read', + 'write.update.mode'='merge-on-read', + 'write.merge.mode'='merge-on-read' + ) + """, + f""" + INSERT INTO {identifier} VALUES (10, 100), (10, 101), (20, 200), (20, 201), (20, 202) + """, + # Generate a positional delete + f""" + DELETE FROM {identifier} WHERE number = 101 + """, + ], + ) + + tbl = session_catalog.load_table(identifier) + + files = list(tbl.scan().plan_files()) + assert len(files) == 2 + + # Will rewrite a data file without a positional delete + tbl.delete(EqualTo("number", 201)) + + # One positional delete has been added, but an OVERWRITE status is set + # https://github.com/apache/iceberg/issues/10122 + snapshots = tbl.snapshots() + assert len(snapshots) == 3 + + # Snapshots produced by Spark + assert [snapshot.summary.operation.value for snapshot in tbl.snapshots()[0:2]] == ["append", "overwrite"] + + # Will rewrite one parquet file + assert snapshots[2].summary == Summary( + Operation.OVERWRITE, + **{ + "added-files-size": snapshots[2].summary["total-files-size"], + "added-data-files": "1", + "added-records": "2", + "changed-partition-count": "1", + "total-files-size": snapshots[2].summary["total-files-size"], + "total-delete-files": "0", + "total-data-files": "1", + "total-position-deletes": "0", + "total-records": "2", + "total-equality-deletes": "0", + "deleted-data-files": "2", + "removed-delete-files": "1", + "deleted-records": "5", + "removed-files-size": snapshots[2].summary["removed-files-size"], + "removed-position-deletes": "1", + }, + ) + + assert tbl.scan().to_arrow().to_pydict() == {"number_partitioned": [20, 20, 10], "number": [200, 202, 100]} + + +@pytest.mark.integration +def test_delete_no_match(session_catalog: RestCatalog) -> None: + arrow_schema = pa.schema([pa.field("ints", pa.int32())]) + arrow_tbl = pa.Table.from_pylist( + [ + {"ints": 1}, + {"ints": 3}, + ], + schema=arrow_schema, + ) + + iceberg_schema = Schema(NestedField(1, "ints", IntegerType())) + + tbl_identifier = "default.test_delete_no_match" + + try: + session_catalog.drop_table(tbl_identifier) + except NoSuchTableError: + pass + + tbl = session_catalog.create_table(tbl_identifier, iceberg_schema) + tbl.append(arrow_tbl) + + assert [snapshot.summary.operation for snapshot in tbl.snapshots()] == [Operation.APPEND] + + tbl.delete("ints == 2") # Only 1 and 3 in the file, but is between the lower and upper bound + + assert [snapshot.summary.operation for snapshot in tbl.snapshots()] == [Operation.APPEND] + + +@pytest.mark.integration +def test_delete_overwrite(session_catalog: RestCatalog) -> None: + arrow_schema = pa.schema([pa.field("ints", pa.int32())]) + arrow_tbl = pa.Table.from_pylist( + [ + {"ints": 1}, + {"ints": 2}, + ], + schema=arrow_schema, + ) + + iceberg_schema = Schema(NestedField(1, "ints", IntegerType())) + + tbl_identifier = "default.test_delete_overwrite" + + try: + session_catalog.drop_table(tbl_identifier) + except NoSuchTableError: + pass + + tbl = session_catalog.create_table(tbl_identifier, iceberg_schema) + tbl.append(arrow_tbl) + + assert [snapshot.summary.operation 
for snapshot in tbl.snapshots()] == [Operation.APPEND] + + arrow_tbl_overwrite = pa.Table.from_pylist( + [ + {"ints": 3}, + {"ints": 4}, + ], + schema=arrow_schema, + ) + tbl.overwrite(arrow_tbl_overwrite, "ints == 2") # Should rewrite one file + + assert [snapshot.summary.operation for snapshot in tbl.snapshots()] == [ + Operation.APPEND, + Operation.OVERWRITE, + Operation.APPEND, + ] + + assert tbl.scan().to_arrow()["ints"].to_pylist() == [3, 4, 1] + + +@pytest.mark.integration +def test_delete_truncate(session_catalog: RestCatalog) -> None: + arrow_schema = pa.schema([pa.field("ints", pa.int32())]) + arrow_tbl = pa.Table.from_pylist( + [ + {"ints": 1}, + ], + schema=arrow_schema, + ) + + iceberg_schema = Schema(NestedField(1, "ints", IntegerType())) + + tbl_identifier = "default.test_delete_overwrite" + + try: + session_catalog.drop_table(tbl_identifier) + except NoSuchTableError: + pass + + tbl = session_catalog.create_table(tbl_identifier, iceberg_schema) + tbl.append(arrow_tbl) + + # Effectively a truncate + tbl.delete(delete_filter=AlwaysTrue()) + + manifests = tbl.current_snapshot().manifests(tbl.io) + assert len(manifests) == 1 + + entries = manifests[0].fetch_manifest_entry(tbl.io, discard_deleted=False) + assert len(entries) == 1 + + assert entries[0].status == ManifestEntryStatus.DELETED diff --git a/tests/integration/test_inspect_table.py b/tests/integration/test_inspect_table.py index 834fe83d5f..9415d7146d 100644 --- a/tests/integration/test_inspect_table.py +++ b/tests/integration/test_inspect_table.py @@ -103,21 +103,38 @@ def test_inspect_snapshots( assert isinstance(snapshot_id.as_py(), int) assert df["parent_id"][0].as_py() is None - assert df["parent_id"][1:] == df["snapshot_id"][:2] + assert df["parent_id"][1:].to_pylist() == df["snapshot_id"][:-1].to_pylist() - assert [operation.as_py() for operation in df["operation"]] == ["append", "overwrite", "append"] + assert [operation.as_py() for operation in df["operation"]] == ["append", "delete", "append", "append"] for manifest_list in df["manifest_list"]: assert manifest_list.as_py().startswith("s3://") + file_size = int(next(value for key, value in df["summary"][0].as_py() if key == "added-files-size")) + assert file_size > 0 + + # Append assert df["summary"][0].as_py() == [ - ("added-files-size", "5459"), + ("added-files-size", str(file_size)), ("added-data-files", "1"), ("added-records", "3"), ("total-data-files", "1"), ("total-delete-files", "0"), ("total-records", "3"), - ("total-files-size", "5459"), + ("total-files-size", str(file_size)), + ("total-position-deletes", "0"), + ("total-equality-deletes", "0"), + ] + + # Delete + assert df["summary"][1].as_py() == [ + ("removed-files-size", str(file_size)), + ("deleted-data-files", "1"), + ("deleted-records", "3"), + ("total-data-files", "0"), + ("total-delete-files", "0"), + ("total-records", "0"), + ("total-files-size", "0"), ("total-position-deletes", "0"), ("total-equality-deletes", "0"), ] diff --git a/tests/integration/test_rest_schema.py b/tests/integration/test_rest_schema.py index f4ab98a883..644cb8053d 100644 --- a/tests/integration/test_rest_schema.py +++ b/tests/integration/test_rest_schema.py @@ -2512,14 +2512,18 @@ def test_two_add_schemas_in_a_single_transaction(catalog: Catalog) -> None: ), ) - with pytest.raises(CommitFailedException) as exc_info: - with tbl.transaction() as tr: - with tr.update_schema() as update: - update.add_column("bar", field_type=StringType()) - with tr.update_schema() as update: - update.add_column("baz", 
field_type=StringType()) + with tbl.transaction() as tr: + with tr.update_schema() as update: + update.add_column("bar", field_type=StringType()) + with tr.update_schema() as update: + update.add_column("baz", field_type=StringType()) - assert "CommitFailedException: Requirement failed: current schema changed: expected id 1 != 0" in str(exc_info.value) + assert tbl.schema().schema_id == 2 + assert tbl.schema() == Schema( + NestedField(field_id=1, name="foo", field_type=StringType()), + NestedField(field_id=2, name="bar", field_type=StringType()), + NestedField(field_id=3, name="baz", field_type=StringType()), + ) @pytest.mark.integration diff --git a/tests/integration/test_writes/test_partitioned_writes.py b/tests/integration/test_writes/test_partitioned_writes.py index f6e6e93c11..b199f00210 100644 --- a/tests/integration/test_writes/test_partitioned_writes.py +++ b/tests/integration/test_writes/test_partitioned_writes.py @@ -38,7 +38,6 @@ TruncateTransform, YearTransform, ) -from tests.conftest import TEST_DATA_WITH_NULL from utils import TABLE_SCHEMA, _create_table @@ -70,7 +69,7 @@ def test_query_filter_null_partitioned( assert tbl.format_version == format_version, f"Expected v{format_version}, got: v{tbl.format_version}" df = spark.table(identifier) assert df.count() == 3, f"Expected 3 total rows for {identifier}" - for col in TEST_DATA_WITH_NULL.keys(): + for col in arrow_table_with_null.column_names: assert df.where(f"{col} is not null").count() == 2, f"Expected 2 non-null rows for {col}" assert df.where(f"{col} is null").count() == 1, f"Expected 1 null row for {col} is null" @@ -81,7 +80,12 @@ def test_query_filter_null_partitioned( ) @pytest.mark.parametrize("format_version", [1, 2]) def test_query_filter_without_data_partitioned( - session_catalog: Catalog, spark: SparkSession, arrow_table_without_data: pa.Table, part_col: str, format_version: int + session_catalog: Catalog, + spark: SparkSession, + arrow_table_without_data: pa.Table, + part_col: str, + arrow_table_with_null: pa.Table, + format_version: int, ) -> None: # Given identifier = f"default.arrow_table_v{format_version}_without_data_partitioned_on_col_{part_col}" @@ -102,7 +106,7 @@ def test_query_filter_without_data_partitioned( # Then assert tbl.format_version == format_version, f"Expected v{format_version}, got: v{tbl.format_version}" df = spark.table(identifier) - for col in TEST_DATA_WITH_NULL.keys(): + for col in arrow_table_with_null.column_names: assert df.where(f"{col} is null").count() == 0, f"Expected 0 row for {col}" assert df.where(f"{col} is not null").count() == 0, f"Expected 0 row for {col}" @@ -134,7 +138,7 @@ def test_query_filter_only_nulls_partitioned( # Then assert tbl.format_version == format_version, f"Expected v{format_version}, got: v{tbl.format_version}" df = spark.table(identifier) - for col in TEST_DATA_WITH_NULL.keys(): + for col in arrow_table_with_only_nulls.column_names: assert df.where(f"{col} is null").count() == 2, f"Expected 2 row for {col}" assert df.where(f"{col} is not null").count() == 0, f"Expected 0 rows for {col}" @@ -169,7 +173,7 @@ def test_query_filter_appended_null_partitioned( # Then assert tbl.format_version == format_version, f"Expected v{format_version}, got: v{tbl.format_version}" df = spark.table(identifier) - for col in TEST_DATA_WITH_NULL.keys(): + for col in arrow_table_with_null.column_names: assert df.where(f"{col} is not null").count() == 6, f"Expected 6 non-null rows for {col}" assert df.where(f"{col} is null").count() == 3, f"Expected 3 null rows for {col}" # 
expecting 6 files: first append with [A], [B], [C], second append with [A, A], [B, B], [C, C] @@ -212,7 +216,7 @@ def test_query_filter_v1_v2_append_null( # Then assert tbl.format_version == 2, f"Expected v2, got: v{tbl.format_version}" - for col in TEST_DATA_WITH_NULL.keys(): # type: ignore + for col in arrow_table_with_null.column_names: # type: ignore df = spark.table(identifier) assert df.where(f"{col} is not null").count() == 4, f"Expected 4 non-null rows for {col}" assert df.where(f"{col} is null").count() == 2, f"Expected 2 null rows for {col}" @@ -248,15 +252,19 @@ def test_summaries_with_null(spark: SparkSession, session_catalog: Catalog, arro assert operations == ["append", "append"] summaries = [row.summary for row in rows] + + file_size = int(summaries[0]["added-files-size"]) + assert file_size > 0 + assert summaries[0] == { "changed-partition-count": "3", "added-data-files": "3", - "added-files-size": "15029", + "added-files-size": str(file_size), "added-records": "3", "total-data-files": "3", "total-delete-files": "0", "total-equality-deletes": "0", - "total-files-size": "15029", + "total-files-size": str(file_size), "total-position-deletes": "0", "total-records": "3", } @@ -264,12 +272,12 @@ def test_summaries_with_null(spark: SparkSession, session_catalog: Catalog, arro assert summaries[1] == { "changed-partition-count": "3", "added-data-files": "3", - "added-files-size": "15029", + "added-files-size": str(file_size), "added-records": "3", "total-data-files": "6", "total-delete-files": "0", "total-equality-deletes": "0", - "total-files-size": "30058", + "total-files-size": str(file_size * 2), "total-position-deletes": "0", "total-records": "6", } @@ -422,7 +430,7 @@ def test_append_ymd_transform_partitioned( assert tbl.format_version == format_version, f"Expected v{format_version}, got: v{tbl.format_version}" df = spark.table(identifier) assert df.count() == 3, f"Expected 3 total rows for {identifier}" - for col in TEST_DATA_WITH_NULL.keys(): + for col in arrow_table_with_null.column_names: assert df.where(f"{col} is not null").count() == 2, f"Expected 2 non-null rows for {col}" assert df.where(f"{col} is null").count() == 1, f"Expected 1 null row for {col} is null" @@ -453,7 +461,7 @@ def test_append_transform_partition_verify_partitions_count( session_catalog: Catalog, spark: SparkSession, arrow_table_date_timestamps: pa.Table, - arrow_table_date_timestamps_schema: Schema, + table_date_timestamps_schema: Schema, transform: Transform[Any, Any], expected_partitions: Set[Any], format_version: int, @@ -461,7 +469,7 @@ def test_append_transform_partition_verify_partitions_count( # Given part_col = "timestamptz" identifier = f"default.arrow_table_v{format_version}_with_{str(transform)}_transform_partitioned_on_col_{part_col}" - nested_field = arrow_table_date_timestamps_schema.find_field(part_col) + nested_field = table_date_timestamps_schema.find_field(part_col) partition_spec = PartitionSpec( PartitionField(source_id=nested_field.field_id, field_id=1001, transform=transform, name=part_col), ) @@ -473,7 +481,7 @@ def test_append_transform_partition_verify_partitions_count( properties={"format-version": str(format_version)}, data=[arrow_table_date_timestamps], partition_spec=partition_spec, - schema=arrow_table_date_timestamps_schema, + schema=table_date_timestamps_schema, ) # Then @@ -502,20 +510,20 @@ def test_append_multiple_partitions( session_catalog: Catalog, spark: SparkSession, arrow_table_date_timestamps: pa.Table, - arrow_table_date_timestamps_schema: Schema, + 
table_date_timestamps_schema: Schema, format_version: int, ) -> None: # Given identifier = f"default.arrow_table_v{format_version}_with_multiple_partitions" partition_spec = PartitionSpec( PartitionField( - source_id=arrow_table_date_timestamps_schema.find_field("date").field_id, + source_id=table_date_timestamps_schema.find_field("date").field_id, field_id=1001, transform=YearTransform(), name="date_year", ), PartitionField( - source_id=arrow_table_date_timestamps_schema.find_field("timestamptz").field_id, + source_id=table_date_timestamps_schema.find_field("timestamptz").field_id, field_id=1000, transform=HourTransform(), name="timestamptz_hour", @@ -529,7 +537,7 @@ def test_append_multiple_partitions( properties={"format-version": str(format_version)}, data=[arrow_table_date_timestamps], partition_spec=partition_spec, - schema=arrow_table_date_timestamps_schema, + schema=table_date_timestamps_schema, ) # Then diff --git a/tests/integration/test_writes/test_writes.py b/tests/integration/test_writes/test_writes.py index 4585406cbb..09fe654d29 100644 --- a/tests/integration/test_writes/test_writes.py +++ b/tests/integration/test_writes/test_writes.py @@ -23,6 +23,7 @@ from typing import Any, Dict from urllib.parse import urlparse +import pandas as pd import pyarrow as pa import pyarrow.parquet as pq import pytest @@ -37,12 +38,12 @@ from pyiceberg.catalog.rest import RestCatalog from pyiceberg.catalog.sql import SqlCatalog from pyiceberg.exceptions import NoSuchTableError +from pyiceberg.io.pyarrow import _dataframe_to_data_files from pyiceberg.partitioning import PartitionField, PartitionSpec from pyiceberg.schema import Schema -from pyiceberg.table import TableProperties, _dataframe_to_data_files +from pyiceberg.table import TableProperties from pyiceberg.transforms import IdentityTransform -from pyiceberg.types import IntegerType, NestedField -from tests.conftest import TEST_DATA_WITH_NULL +from pyiceberg.types import IntegerType, LongType, NestedField from utils import _create_table @@ -124,52 +125,55 @@ def test_query_count(spark: SparkSession, format_version: int) -> None: @pytest.mark.integration -@pytest.mark.parametrize("col", TEST_DATA_WITH_NULL.keys()) @pytest.mark.parametrize("format_version", [1, 2]) -def test_query_filter_null(spark: SparkSession, col: str, format_version: int) -> None: +def test_query_filter_null(spark: SparkSession, arrow_table_with_null: pa.Table, format_version: int) -> None: identifier = f"default.arrow_table_v{format_version}_with_null" df = spark.table(identifier) - assert df.where(f"{col} is null").count() == 1, f"Expected 1 row for {col}" - assert df.where(f"{col} is not null").count() == 2, f"Expected 2 rows for {col}" + for col in arrow_table_with_null.column_names: + assert df.where(f"{col} is null").count() == 1, f"Expected 1 row for {col}" + assert df.where(f"{col} is not null").count() == 2, f"Expected 2 rows for {col}" @pytest.mark.integration -@pytest.mark.parametrize("col", TEST_DATA_WITH_NULL.keys()) @pytest.mark.parametrize("format_version", [1, 2]) -def test_query_filter_without_data(spark: SparkSession, col: str, format_version: int) -> None: +def test_query_filter_without_data(spark: SparkSession, arrow_table_with_null: pa.Table, format_version: int) -> None: identifier = f"default.arrow_table_v{format_version}_without_data" df = spark.table(identifier) - assert df.where(f"{col} is null").count() == 0, f"Expected 0 row for {col}" - assert df.where(f"{col} is not null").count() == 0, f"Expected 0 row for {col}" + for col in 
arrow_table_with_null.column_names: + assert df.where(f"{col} is null").count() == 0, f"Expected 0 row for {col}" + assert df.where(f"{col} is not null").count() == 0, f"Expected 0 row for {col}" @pytest.mark.integration -@pytest.mark.parametrize("col", TEST_DATA_WITH_NULL.keys()) @pytest.mark.parametrize("format_version", [1, 2]) -def test_query_filter_only_nulls(spark: SparkSession, col: str, format_version: int) -> None: +def test_query_filter_only_nulls(spark: SparkSession, arrow_table_with_null: pa.Table, format_version: int) -> None: identifier = f"default.arrow_table_v{format_version}_with_only_nulls" df = spark.table(identifier) - assert df.where(f"{col} is null").count() == 2, f"Expected 2 rows for {col}" - assert df.where(f"{col} is not null").count() == 0, f"Expected 0 row for {col}" + for col in arrow_table_with_null.column_names: + assert df.where(f"{col} is null").count() == 2, f"Expected 2 rows for {col}" + assert df.where(f"{col} is not null").count() == 0, f"Expected 0 row for {col}" @pytest.mark.integration -@pytest.mark.parametrize("col", TEST_DATA_WITH_NULL.keys()) @pytest.mark.parametrize("format_version", [1, 2]) -def test_query_filter_appended_null(spark: SparkSession, col: str, format_version: int) -> None: +def test_query_filter_appended_null(spark: SparkSession, arrow_table_with_null: pa.Table, format_version: int) -> None: identifier = f"default.arrow_table_v{format_version}_appended_with_null" df = spark.table(identifier) - assert df.where(f"{col} is null").count() == 2, f"Expected 1 row for {col}" - assert df.where(f"{col} is not null").count() == 4, f"Expected 2 rows for {col}" + for col in arrow_table_with_null.column_names: + assert df.where(f"{col} is null").count() == 2, f"Expected 1 row for {col}" + assert df.where(f"{col} is not null").count() == 4, f"Expected 2 rows for {col}" @pytest.mark.integration -@pytest.mark.parametrize("col", TEST_DATA_WITH_NULL.keys()) -def test_query_filter_v1_v2_append_null(spark: SparkSession, col: str) -> None: +def test_query_filter_v1_v2_append_null( + spark: SparkSession, + arrow_table_with_null: pa.Table, +) -> None: identifier = "default.arrow_table_v1_v2_appended_with_null" df = spark.table(identifier) - assert df.where(f"{col} is null").count() == 2, f"Expected 1 row for {col}" - assert df.where(f"{col} is not null").count() == 4, f"Expected 2 rows for {col}" + for col in arrow_table_with_null.column_names: + assert df.where(f"{col} is null").count() == 2, f"Expected 1 row for {col}" + assert df.where(f"{col} is not null").count() == 4, f"Expected 2 rows for {col}" @pytest.mark.integration @@ -187,45 +191,61 @@ def test_summaries(spark: SparkSession, session_catalog: Catalog, arrow_table_wi ).collect() operations = [row.operation for row in rows] - assert operations == ["append", "append", "overwrite"] + assert operations == ["append", "append", "delete", "append"] summaries = [row.summary for row in rows] + file_size = int(summaries[0]["added-files-size"]) + assert file_size > 0 + + # Append assert summaries[0] == { "added-data-files": "1", - "added-files-size": "5459", + "added-files-size": str(file_size), "added-records": "3", "total-data-files": "1", "total-delete-files": "0", "total-equality-deletes": "0", - "total-files-size": "5459", + "total-files-size": str(file_size), "total-position-deletes": "0", "total-records": "3", } + # Append assert summaries[1] == { "added-data-files": "1", - "added-files-size": "5459", + "added-files-size": str(file_size), "added-records": "3", "total-data-files": "2", 
"total-delete-files": "0", "total-equality-deletes": "0", - "total-files-size": "10918", + "total-files-size": str(file_size * 2), "total-position-deletes": "0", "total-records": "6", } + # Delete assert summaries[2] == { - "added-data-files": "1", - "added-files-size": "5459", - "added-records": "3", "deleted-data-files": "2", "deleted-records": "6", - "removed-files-size": "10918", + "removed-files-size": str(file_size * 2), + "total-data-files": "0", + "total-delete-files": "0", + "total-equality-deletes": "0", + "total-files-size": "0", + "total-position-deletes": "0", + "total-records": "0", + } + + # Overwrite + assert summaries[3] == { + "added-data-files": "1", + "added-files-size": str(file_size), + "added-records": "3", "total-data-files": "1", "total-delete-files": "0", "total-equality-deletes": "0", - "total-files-size": "5459", + "total-files-size": str(file_size), "total-position-deletes": "0", "total-records": "3", } @@ -249,9 +269,9 @@ def test_data_files(spark: SparkSession, session_catalog: Catalog, arrow_table_w """ ).collect() - assert [row.added_data_files_count for row in rows] == [1, 1, 0, 1, 1] + assert [row.added_data_files_count for row in rows] == [1, 0, 1, 1, 1] assert [row.existing_data_files_count for row in rows] == [0, 0, 0, 0, 0] - assert [row.deleted_data_files_count for row in rows] == [0, 0, 1, 0, 0] + assert [row.deleted_data_files_count for row in rows] == [0, 1, 0, 0, 0] @pytest.mark.integration @@ -556,10 +576,13 @@ def test_summaries_with_only_nulls( ).collect() operations = [row.operation for row in rows] - assert operations == ["append", "append", "overwrite"] + assert operations == ["append", "append", "delete", "append"] summaries = [row.summary for row in rows] + file_size = int(summaries[1]["added-files-size"]) + assert file_size > 0 + assert summaries[0] == { "total-data-files": "0", "total-delete-files": "0", @@ -571,25 +594,34 @@ def test_summaries_with_only_nulls( assert summaries[1] == { "added-data-files": "1", - "added-files-size": "4239", + "added-files-size": str(file_size), "added-records": "2", "total-data-files": "1", "total-delete-files": "0", "total-equality-deletes": "0", - "total-files-size": "4239", + "total-files-size": str(file_size), "total-position-deletes": "0", "total-records": "2", } assert summaries[2] == { - "removed-files-size": "4239", - "total-equality-deletes": "0", - "total-position-deletes": "0", "deleted-data-files": "1", + "deleted-records": "2", + "removed-files-size": str(file_size), + "total-data-files": "0", "total-delete-files": "0", + "total-equality-deletes": "0", "total-files-size": "0", - "deleted-records": "2", + "total-position-deletes": "0", + "total-records": "0", + } + + assert summaries[3] == { "total-data-files": "0", + "total-delete-files": "0", + "total-equality-deletes": "0", + "total-files-size": "0", + "total-position-deletes": "0", "total-records": "0", } @@ -812,21 +844,38 @@ def test_inspect_snapshots( assert isinstance(snapshot_id.as_py(), int) assert df["parent_id"][0].as_py() is None - assert df["parent_id"][1:] == df["snapshot_id"][:2] + assert df["parent_id"][1:].to_pylist() == df["snapshot_id"][:-1].to_pylist() - assert [operation.as_py() for operation in df["operation"]] == ["append", "overwrite", "append"] + assert [operation.as_py() for operation in df["operation"]] == ["append", "delete", "append", "append"] for manifest_list in df["manifest_list"]: assert manifest_list.as_py().startswith("s3://") + file_size = int(next(value for key, value in df["summary"][0].as_py() if key 
== "added-files-size")) + assert file_size > 0 + + # Append assert df["summary"][0].as_py() == [ - ("added-files-size", "5459"), + ("added-files-size", str(file_size)), ("added-data-files", "1"), ("added-records", "3"), ("total-data-files", "1"), ("total-delete-files", "0"), ("total-records", "3"), - ("total-files-size", "5459"), + ("total-files-size", str(file_size)), + ("total-position-deletes", "0"), + ("total-equality-deletes", "0"), + ] + + # Delete + assert df["summary"][1].as_py() == [ + ("removed-files-size", str(file_size)), + ("deleted-data-files", "1"), + ("deleted-records", "3"), + ("total-data-files", "0"), + ("total-delete-files", "0"), + ("total-records", "0"), + ("total-files-size", "0"), ("total-position-deletes", "0"), ("total-equality-deletes", "0"), ] @@ -915,9 +964,10 @@ def test_sanitize_character_partitioned(catalog: Catalog) -> None: assert len(tbl.scan().to_arrow()) == 22 +@pytest.mark.integration @pytest.mark.parametrize("format_version", [1, 2]) -def table_write_subset_of_schema(session_catalog: Catalog, arrow_table_with_null: pa.Table, format_version: int) -> None: - identifier = "default.table_append_subset_of_schema" +def test_table_write_subset_of_schema(session_catalog: Catalog, arrow_table_with_null: pa.Table, format_version: int) -> None: + identifier = "default.test_table_write_subset_of_schema" tbl = _create_table(session_catalog, identifier, {"format-version": format_version}, [arrow_table_with_null]) arrow_table_without_some_columns = arrow_table_with_null.combine_chunks().drop(arrow_table_with_null.column_names[0]) assert len(arrow_table_without_some_columns.columns) < len(arrow_table_with_null.columns) @@ -925,3 +975,310 @@ def table_write_subset_of_schema(session_catalog: Catalog, arrow_table_with_null tbl.append(arrow_table_without_some_columns) # overwrite and then append should produce twice the data assert len(tbl.scan().to_arrow()) == len(arrow_table_without_some_columns) * 2 + + +@pytest.mark.integration +@pytest.mark.parametrize("format_version", [1, 2]) +def test_table_write_out_of_order_schema(session_catalog: Catalog, arrow_table_with_null: pa.Table, format_version: int) -> None: + identifier = "default.test_table_write_out_of_order_schema" + # rotate the schema fields by 1 + fields = list(arrow_table_with_null.schema) + rotated_fields = fields[1:] + fields[:1] + rotated_schema = pa.schema(rotated_fields) + assert arrow_table_with_null.schema != rotated_schema + tbl = _create_table(session_catalog, identifier, {"format-version": format_version}, schema=rotated_schema) + + tbl.overwrite(arrow_table_with_null) + tbl.append(arrow_table_with_null) + # overwrite and then append should produce twice the data + assert len(tbl.scan().to_arrow()) == len(arrow_table_with_null) * 2 + + +@pytest.mark.integration +@pytest.mark.parametrize("format_version", [1, 2]) +def test_table_write_schema_with_valid_nullability_diff( + spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table, format_version: int +) -> None: + identifier = "default.test_table_write_with_valid_nullability_diff" + table_schema = Schema( + NestedField(field_id=1, name="long", field_type=LongType(), required=False), + ) + other_schema = pa.schema(( + pa.field("long", pa.int64(), nullable=False), # can support writing required pyarrow field to optional Iceberg field + )) + arrow_table = pa.Table.from_pydict( + { + "long": [1, 9], + }, + schema=other_schema, + ) + tbl = _create_table(session_catalog, identifier, {"format-version": format_version}, [arrow_table], 
schema=table_schema) + # table's long field should cast to be optional on read + written_arrow_table = tbl.scan().to_arrow() + assert written_arrow_table == arrow_table.cast(pa.schema((pa.field("long", pa.int64(), nullable=True),))) + lhs = spark.table(f"{identifier}").toPandas() + rhs = written_arrow_table.to_pandas() + + for column in written_arrow_table.column_names: + for left, right in zip(lhs[column].to_list(), rhs[column].to_list()): + assert left == right + + +@pytest.mark.integration +@pytest.mark.parametrize("format_version", [1, 2]) +def test_table_write_schema_with_valid_upcast( + spark: SparkSession, + session_catalog: Catalog, + format_version: int, + table_schema_with_promoted_types: Schema, + pyarrow_schema_with_promoted_types: pa.Schema, + pyarrow_table_with_promoted_types: pa.Table, +) -> None: + identifier = "default.test_table_write_with_valid_upcast" + + tbl = _create_table( + session_catalog, + identifier, + {"format-version": format_version}, + [pyarrow_table_with_promoted_types], + schema=table_schema_with_promoted_types, + ) + # table's long field should cast to long on read + written_arrow_table = tbl.scan().to_arrow() + assert written_arrow_table == pyarrow_table_with_promoted_types.cast( + pa.schema(( + pa.field("long", pa.int64(), nullable=True), + pa.field("list", pa.large_list(pa.int64()), nullable=False), + pa.field("map", pa.map_(pa.large_string(), pa.int64()), nullable=False), + pa.field("double", pa.float64(), nullable=True), # can support upcasting float to double + pa.field("uuid", pa.binary(length=16), nullable=True), # can UUID is read as fixed length binary of length 16 + )) + ) + lhs = spark.table(f"{identifier}").toPandas() + rhs = written_arrow_table.to_pandas() + + for column in written_arrow_table.column_names: + for left, right in zip(lhs[column].to_list(), rhs[column].to_list()): + if column == "map": + # Arrow returns a list of tuples, instead of a dict + right = dict(right) + if column == "list": + # Arrow returns an array, convert to list for equality check + left, right = list(left), list(right) + if column == "uuid": + # Spark Iceberg represents UUID as hex string like '715a78ef-4e53-4089-9bf9-3ad0ee9bf545' + # whereas PyIceberg represents UUID as bytes on read + left, right = left.replace("-", ""), right.hex() + assert left == right + + +@pytest.mark.integration +@pytest.mark.parametrize("format_version", [1, 2]) +def test_write_all_timestamp_precision( + mocker: MockerFixture, + spark: SparkSession, + session_catalog: Catalog, + format_version: int, + arrow_table_schema_with_all_timestamp_precisions: pa.Schema, + arrow_table_with_all_timestamp_precisions: pa.Table, + arrow_table_schema_with_all_microseconds_timestamp_precisions: pa.Schema, +) -> None: + identifier = "default.table_all_timestamp_precision" + mocker.patch.dict(os.environ, values={"PYICEBERG_DOWNCAST_NS_TIMESTAMP_TO_US_ON_WRITE": "True"}) + + tbl = _create_table( + session_catalog, + identifier, + {"format-version": format_version}, + data=[arrow_table_with_all_timestamp_precisions], + schema=arrow_table_schema_with_all_timestamp_precisions, + ) + tbl.overwrite(arrow_table_with_all_timestamp_precisions) + written_arrow_table = tbl.scan().to_arrow() + + assert written_arrow_table.schema == arrow_table_schema_with_all_microseconds_timestamp_precisions + assert written_arrow_table == arrow_table_with_all_timestamp_precisions.cast( + arrow_table_schema_with_all_microseconds_timestamp_precisions, safe=False + ) + lhs = spark.table(f"{identifier}").toPandas() + rhs = 
written_arrow_table.to_pandas() + + for column in written_arrow_table.column_names: + for left, right in zip(lhs[column].to_list(), rhs[column].to_list()): + if pd.isnull(left): + assert pd.isnull(right) + else: + # Check only upto microsecond precision since Spark loaded dtype is timezone unaware + # and supports upto microsecond precision + assert left.timestamp() == right.timestamp(), f"Difference in column {column}: {left} != {right}" + + +@pytest.mark.integration +@pytest.mark.parametrize("format_version", [1, 2]) +def test_merge_manifests(session_catalog: Catalog, arrow_table_with_null: pa.Table, format_version: int) -> None: + tbl_a = _create_table( + session_catalog, + "default.merge_manifest_a", + {"commit.manifest-merge.enabled": "true", "commit.manifest.min-count-to-merge": "1", "format-version": format_version}, + [], + ) + tbl_b = _create_table( + session_catalog, + "default.merge_manifest_b", + { + "commit.manifest-merge.enabled": "true", + "commit.manifest.min-count-to-merge": "1", + "commit.manifest.target-size-bytes": "1", + "format-version": format_version, + }, + [], + ) + tbl_c = _create_table( + session_catalog, + "default.merge_manifest_c", + {"commit.manifest.min-count-to-merge": "1", "format-version": format_version}, + [], + ) + + # tbl_a should merge all manifests into 1 + tbl_a.append(arrow_table_with_null) + tbl_a.append(arrow_table_with_null) + tbl_a.append(arrow_table_with_null) + + # tbl_b should not merge any manifests because the target size is too small + tbl_b.append(arrow_table_with_null) + tbl_b.append(arrow_table_with_null) + tbl_b.append(arrow_table_with_null) + + # tbl_c should not merge any manifests because merging is disabled + tbl_c.append(arrow_table_with_null) + tbl_c.append(arrow_table_with_null) + tbl_c.append(arrow_table_with_null) + + assert len(tbl_a.current_snapshot().manifests(tbl_a.io)) == 1 # type: ignore + assert len(tbl_b.current_snapshot().manifests(tbl_b.io)) == 3 # type: ignore + assert len(tbl_c.current_snapshot().manifests(tbl_c.io)) == 3 # type: ignore + + # tbl_a and tbl_c should contain the same data + assert tbl_a.scan().to_arrow().equals(tbl_c.scan().to_arrow()) + # tbl_b and tbl_c should contain the same data + assert tbl_b.scan().to_arrow().equals(tbl_c.scan().to_arrow()) + + +@pytest.mark.integration +@pytest.mark.parametrize("format_version", [1, 2]) +def test_merge_manifests_file_content(session_catalog: Catalog, arrow_table_with_null: pa.Table, format_version: int) -> None: + tbl_a = _create_table( + session_catalog, + "default.merge_manifest_a", + {"commit.manifest-merge.enabled": "true", "commit.manifest.min-count-to-merge": "1", "format-version": format_version}, + [], + ) + + # tbl_a should merge all manifests into 1 + tbl_a.append(arrow_table_with_null) + + tbl_a_first_entries = tbl_a.inspect.entries().to_pydict() + first_snapshot_id = tbl_a_first_entries["snapshot_id"][0] + first_data_file_path = tbl_a_first_entries["data_file"][0]["file_path"] + + tbl_a.append(arrow_table_with_null) + tbl_a.append(arrow_table_with_null) + + assert len(tbl_a.current_snapshot().manifests(tbl_a.io)) == 1 # type: ignore + + # verify the sequence number of tbl_a's only manifest file + tbl_a_manifest = tbl_a.current_snapshot().manifests(tbl_a.io)[0] # type: ignore + assert tbl_a_manifest.sequence_number == (3 if format_version == 2 else 0) + assert tbl_a_manifest.min_sequence_number == (1 if format_version == 2 else 0) + + # verify the manifest entries of tbl_a, in which the manifests are merged + tbl_a_entries = 
tbl_a.inspect.entries().to_pydict() + assert tbl_a_entries["status"] == [1, 0, 0] + assert tbl_a_entries["sequence_number"] == [3, 2, 1] if format_version == 2 else [0, 0, 0] + assert tbl_a_entries["file_sequence_number"] == [3, 2, 1] if format_version == 2 else [0, 0, 0] + for i in range(3): + tbl_a_data_file = tbl_a_entries["data_file"][i] + assert tbl_a_data_file["column_sizes"] == [ + (1, 49), + (2, 78), + (3, 128), + (4, 94), + (5, 118), + (6, 94), + (7, 118), + (8, 118), + (9, 118), + (10, 94), + (11, 78), + (12, 109), + ] + assert tbl_a_data_file["content"] == 0 + assert tbl_a_data_file["equality_ids"] is None + assert tbl_a_data_file["file_format"] == "PARQUET" + assert tbl_a_data_file["file_path"].startswith("s3://warehouse/default/merge_manifest_a/data/") + if tbl_a_data_file["file_path"] == first_data_file_path: + # verify that the snapshot id recorded should be the one where the file was added + assert tbl_a_entries["snapshot_id"][i] == first_snapshot_id + assert tbl_a_data_file["key_metadata"] is None + assert tbl_a_data_file["lower_bounds"] == [ + (1, b"\x00"), + (2, b"a"), + (3, b"aaaaaaaaaaaaaaaa"), + (4, b"\x01\x00\x00\x00"), + (5, b"\x01\x00\x00\x00\x00\x00\x00\x00"), + (6, b"\x00\x00\x00\x80"), + (7, b"\x00\x00\x00\x00\x00\x00\x00\x80"), + (8, b"\x00\x9bj\xca8\xf1\x05\x00"), + (9, b"\x00\x9bj\xca8\xf1\x05\x00"), + (10, b"\x9eK\x00\x00"), + (11, b"\x01"), + (12, b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" b"\x00\x00\x00\x00"), + ] + assert tbl_a_data_file["nan_value_counts"] == [] + assert tbl_a_data_file["null_value_counts"] == [ + (1, 1), + (2, 1), + (3, 1), + (4, 1), + (5, 1), + (6, 1), + (7, 1), + (8, 1), + (9, 1), + (10, 1), + (11, 1), + (12, 1), + ] + assert tbl_a_data_file["partition"] == {} + assert tbl_a_data_file["record_count"] == 3 + assert tbl_a_data_file["sort_order_id"] is None + assert tbl_a_data_file["split_offsets"] == [4] + assert tbl_a_data_file["upper_bounds"] == [ + (1, b"\x01"), + (2, b"z"), + (3, b"zzzzzzzzzzzzzzz{"), + (4, b"\t\x00\x00\x00"), + (5, b"\t\x00\x00\x00\x00\x00\x00\x00"), + (6, b"fff?"), + (7, b"\xcd\xcc\xcc\xcc\xcc\xcc\xec?"), + (8, b"\x00\xbb\r\xab\xdb\xf5\x05\x00"), + (9, b"\x00\xbb\r\xab\xdb\xf5\x05\x00"), + (10, b"\xd9K\x00\x00"), + (11, b"\x12"), + (12, b"\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11\x11" b"\x11\x11\x11\x11"), + ] + assert tbl_a_data_file["value_counts"] == [ + (1, 3), + (2, 3), + (3, 3), + (4, 3), + (5, 3), + (6, 3), + (7, 3), + (8, 3), + (9, 3), + (10, 3), + (11, 3), + (12, 3), + ] diff --git a/tests/io/test_pyarrow.py b/tests/io/test_pyarrow.py index ecb946a98b..d61a50bb0d 100644 --- a/tests/io/test_pyarrow.py +++ b/tests/io/test_pyarrow.py @@ -60,20 +60,24 @@ PyArrowFile, PyArrowFileIO, StatsAggregator, + _check_pyarrow_schema_compatible, _ConvertToArrowSchema, + _determine_partitions, _primitive_to_physical, _read_deletes, + _to_requested_schema, bin_pack_arrow_table, expression_to_pyarrow, project_table, schema_to_pyarrow, ) from pyiceberg.manifest import DataFile, DataFileContent, FileFormat -from pyiceberg.partitioning import PartitionSpec +from pyiceberg.partitioning import PartitionField, PartitionSpec from pyiceberg.schema import Schema, make_compatible_name, visit from pyiceberg.table import FileScanTask, TableProperties from pyiceberg.table.metadata import TableMetadataV2 -from pyiceberg.typedef import UTF8 +from pyiceberg.transforms import IdentityTransform +from pyiceberg.typedef import UTF8, Record from pyiceberg.types import ( BinaryType, BooleanType, @@ -1718,3 +1722,309 @@ def 
test_bin_pack_arrow_table(arrow_table_with_null: pa.Table) -> None: # and will produce half the number of files if we double the target size bin_packed = bin_pack_arrow_table(bigger_arrow_tbl, target_file_size=arrow_table_with_null.nbytes * 2) assert len(list(bin_packed)) == 5 + + +def test_schema_mismatch_type(table_schema_simple: Schema) -> None: + other_schema = pa.schema(( + pa.field("foo", pa.string(), nullable=True), + pa.field("bar", pa.decimal128(18, 6), nullable=False), + pa.field("baz", pa.bool_(), nullable=True), + )) + + expected = r"""Mismatch in fields: +┏━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +┃ ┃ Table field ┃ Dataframe field ┃ +┡━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ +│ ✅ │ 1: foo: optional string │ 1: foo: optional string │ +│ ❌ │ 2: bar: required int │ 2: bar: required decimal\(18, 6\) │ +│ ✅ │ 3: baz: optional boolean │ 3: baz: optional boolean │ +└────┴──────────────────────────┴─────────────────────────────────┘ +""" + + with pytest.raises(ValueError, match=expected): + _check_pyarrow_schema_compatible(table_schema_simple, other_schema) + + +def test_schema_mismatch_nullability(table_schema_simple: Schema) -> None: + other_schema = pa.schema(( + pa.field("foo", pa.string(), nullable=True), + pa.field("bar", pa.int32(), nullable=True), + pa.field("baz", pa.bool_(), nullable=True), + )) + + expected = """Mismatch in fields: +┏━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +┃ ┃ Table field ┃ Dataframe field ┃ +┡━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━┩ +│ ✅ │ 1: foo: optional string │ 1: foo: optional string │ +│ ❌ │ 2: bar: required int │ 2: bar: optional int │ +│ ✅ │ 3: baz: optional boolean │ 3: baz: optional boolean │ +└────┴──────────────────────────┴──────────────────────────┘ +""" + + with pytest.raises(ValueError, match=expected): + _check_pyarrow_schema_compatible(table_schema_simple, other_schema) + + +def test_schema_compatible_nullability_diff(table_schema_simple: Schema) -> None: + other_schema = pa.schema(( + pa.field("foo", pa.string(), nullable=True), + pa.field("bar", pa.int32(), nullable=False), + pa.field("baz", pa.bool_(), nullable=False), + )) + + try: + _check_pyarrow_schema_compatible(table_schema_simple, other_schema) + except Exception: + pytest.fail("Unexpected Exception raised when calling `_check_pyarrow_schema_compatible`") + + +def test_schema_mismatch_missing_field(table_schema_simple: Schema) -> None: + other_schema = pa.schema(( + pa.field("foo", pa.string(), nullable=True), + pa.field("baz", pa.bool_(), nullable=True), + )) + + expected = """Mismatch in fields: +┏━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +┃ ┃ Table field ┃ Dataframe field ┃ +┡━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━┩ +│ ✅ │ 1: foo: optional string │ 1: foo: optional string │ +│ ❌ │ 2: bar: required int │ Missing │ +│ ✅ │ 3: baz: optional boolean │ 3: baz: optional boolean │ +└────┴──────────────────────────┴──────────────────────────┘ +""" + + with pytest.raises(ValueError, match=expected): + _check_pyarrow_schema_compatible(table_schema_simple, other_schema) + + +def test_schema_compatible_missing_nullable_field_nested(table_schema_nested: Schema) -> None: + schema = table_schema_nested.as_arrow() + schema = schema.remove(6).insert( + 6, + pa.field( + "person", + pa.struct([ + pa.field("age", pa.int32(), nullable=False), + ]), + nullable=True, + ), + ) + try: + _check_pyarrow_schema_compatible(table_schema_nested, schema) + except Exception: + 
pytest.fail("Unexpected Exception raised when calling `_check_pyarrow_schema_compatible`") + + +def test_schema_mismatch_missing_required_field_nested(table_schema_nested: Schema) -> None: + other_schema = table_schema_nested.as_arrow() + other_schema = other_schema.remove(6).insert( + 6, + pa.field( + "person", + pa.struct([ + pa.field("name", pa.string(), nullable=True), + ]), + nullable=True, + ), + ) + expected = """Mismatch in fields: +┏━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ +┃ ┃ Table field ┃ Dataframe field ┃ +┡━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ +│ ✅ │ 1: foo: optional string │ 1: foo: optional string │ +│ ✅ │ 2: bar: required int │ 2: bar: required int │ +│ ✅ │ 3: baz: optional boolean │ 3: baz: optional boolean │ +│ ✅ │ 4: qux: required list │ 4: qux: required list │ +│ ✅ │ 5: element: required string │ 5: element: required string │ +│ ✅ │ 6: quux: required map> │ map> │ +│ ✅ │ 7: key: required string │ 7: key: required string │ +│ ✅ │ 8: value: required map │ int> │ +│ ✅ │ 9: key: required string │ 9: key: required string │ +│ ✅ │ 10: value: required int │ 10: value: required int │ +│ ✅ │ 11: location: required │ 11: location: required │ +│ │ list> │ float>> │ +│ ✅ │ 12: element: required struct<13: │ 12: element: required struct<13: │ +│ │ latitude: optional float, 14: │ latitude: optional float, 14: │ +│ │ longitude: optional float> │ longitude: optional float> │ +│ ✅ │ 13: latitude: optional float │ 13: latitude: optional float │ +│ ✅ │ 14: longitude: optional float │ 14: longitude: optional float │ +│ ✅ │ 15: person: optional struct<16: │ 15: person: optional struct<16: │ +│ │ name: optional string, 17: age: │ name: optional string> │ +│ │ required int> │ │ +│ ✅ │ 16: name: optional string │ 16: name: optional string │ +│ ❌ │ 17: age: required int │ Missing │ +└────┴────────────────────────────────────┴────────────────────────────────────┘ +""" + + with pytest.raises(ValueError, match=expected): + _check_pyarrow_schema_compatible(table_schema_nested, other_schema) + + +def test_schema_compatible_nested(table_schema_nested: Schema) -> None: + try: + _check_pyarrow_schema_compatible(table_schema_nested, table_schema_nested.as_arrow()) + except Exception: + pytest.fail("Unexpected Exception raised when calling `_check_pyarrow_schema_compatible`") + + +def test_schema_mismatch_additional_field(table_schema_simple: Schema) -> None: + other_schema = pa.schema(( + pa.field("foo", pa.string(), nullable=True), + pa.field("bar", pa.int32(), nullable=False), + pa.field("baz", pa.bool_(), nullable=True), + pa.field("new_field", pa.date32(), nullable=True), + )) + + with pytest.raises( + ValueError, match=r"PyArrow table contains more columns: new_field. Update the schema first \(hint, use union_by_name\)." 
+ ): + _check_pyarrow_schema_compatible(table_schema_simple, other_schema) + + +def test_schema_compatible(table_schema_simple: Schema) -> None: + try: + _check_pyarrow_schema_compatible(table_schema_simple, table_schema_simple.as_arrow()) + except Exception: + pytest.fail("Unexpected Exception raised when calling `_check_pyarrow_schema_compatible`") + + +def test_schema_projection(table_schema_simple: Schema) -> None: + # remove optional `baz` field from `table_schema_simple` + other_schema = pa.schema(( + pa.field("foo", pa.string(), nullable=True), + pa.field("bar", pa.int32(), nullable=False), + )) + try: + _check_pyarrow_schema_compatible(table_schema_simple, other_schema) + except Exception: + pytest.fail("Unexpected Exception raised when calling `_check_pyarrow_schema_compatible`") + + +def test_schema_downcast(table_schema_simple: Schema) -> None: + # large_string type is compatible with string type + other_schema = pa.schema(( + pa.field("foo", pa.large_string(), nullable=True), + pa.field("bar", pa.int32(), nullable=False), + pa.field("baz", pa.bool_(), nullable=True), + )) + + try: + _check_pyarrow_schema_compatible(table_schema_simple, other_schema) + except Exception: + pytest.fail("Unexpected Exception raised when calling `_check_pyarrow_schema_compatible`") + + +def test_partition_for_demo() -> None: + test_pa_schema = pa.schema([("year", pa.int64()), ("n_legs", pa.int64()), ("animal", pa.string())]) + test_schema = Schema( + NestedField(field_id=1, name="year", field_type=StringType(), required=False), + NestedField(field_id=2, name="n_legs", field_type=IntegerType(), required=True), + NestedField(field_id=3, name="animal", field_type=StringType(), required=False), + schema_id=1, + ) + test_data = { + "year": [2020, 2022, 2022, 2022, 2021, 2022, 2022, 2019, 2021], + "n_legs": [2, 2, 2, 4, 4, 4, 4, 5, 100], + "animal": ["Flamingo", "Parrot", "Parrot", "Horse", "Dog", "Horse", "Horse", "Brittle stars", "Centipede"], + } + arrow_table = pa.Table.from_pydict(test_data, schema=test_pa_schema) + partition_spec = PartitionSpec( + PartitionField(source_id=2, field_id=1002, transform=IdentityTransform(), name="n_legs_identity"), + PartitionField(source_id=1, field_id=1001, transform=IdentityTransform(), name="year_identity"), + ) + result = _determine_partitions(partition_spec, test_schema, arrow_table) + assert {table_partition.partition_key.partition for table_partition in result} == { + Record(n_legs_identity=2, year_identity=2020), + Record(n_legs_identity=100, year_identity=2021), + Record(n_legs_identity=4, year_identity=2021), + Record(n_legs_identity=4, year_identity=2022), + Record(n_legs_identity=2, year_identity=2022), + Record(n_legs_identity=5, year_identity=2019), + } + assert ( + pa.concat_tables([table_partition.arrow_table_partition for table_partition in result]).num_rows == arrow_table.num_rows + ) + + +def test_identity_partition_on_multi_columns() -> None: + test_pa_schema = pa.schema([("born_year", pa.int64()), ("n_legs", pa.int64()), ("animal", pa.string())]) + test_schema = Schema( + NestedField(field_id=1, name="born_year", field_type=StringType(), required=False), + NestedField(field_id=2, name="n_legs", field_type=IntegerType(), required=True), + NestedField(field_id=3, name="animal", field_type=StringType(), required=False), + schema_id=1, + ) + # 5 partitions, 6 unique row values, 12 rows + test_rows = [ + (2021, 4, "Dog"), + (2022, 4, "Horse"), + (2022, 4, "Another Horse"), + (2021, 100, "Centipede"), + (None, 4, "Kirin"), + (2021, None, "Fish"), + ] * 
2 + expected = {Record(n_legs_identity=test_rows[i][1], year_identity=test_rows[i][0]) for i in range(len(test_rows))} + partition_spec = PartitionSpec( + PartitionField(source_id=2, field_id=1002, transform=IdentityTransform(), name="n_legs_identity"), + PartitionField(source_id=1, field_id=1001, transform=IdentityTransform(), name="year_identity"), + ) + import random + + # there are 12! / ((2!)^6) = 7,484,400 permutations, too many to pick all + for _ in range(1000): + random.shuffle(test_rows) + test_data = { + "born_year": [row[0] for row in test_rows], + "n_legs": [row[1] for row in test_rows], + "animal": [row[2] for row in test_rows], + } + arrow_table = pa.Table.from_pydict(test_data, schema=test_pa_schema) + + result = _determine_partitions(partition_spec, test_schema, arrow_table) + + assert {table_partition.partition_key.partition for table_partition in result} == expected + concatenated_arrow_table = pa.concat_tables([table_partition.arrow_table_partition for table_partition in result]) + assert concatenated_arrow_table.num_rows == arrow_table.num_rows + assert concatenated_arrow_table.sort_by([ + ("born_year", "ascending"), + ("n_legs", "ascending"), + ("animal", "ascending"), + ]) == arrow_table.sort_by([("born_year", "ascending"), ("n_legs", "ascending"), ("animal", "ascending")]) + + +def test__to_requested_schema_timestamps( + arrow_table_schema_with_all_timestamp_precisions: pa.Schema, + arrow_table_with_all_timestamp_precisions: pa.Table, + arrow_table_schema_with_all_microseconds_timestamp_precisions: pa.Schema, + table_schema_with_all_microseconds_timestamp_precision: Schema, +) -> None: + requested_schema = table_schema_with_all_microseconds_timestamp_precision + file_schema = requested_schema + batch = arrow_table_with_all_timestamp_precisions.to_batches()[0] + result = _to_requested_schema(requested_schema, file_schema, batch, downcast_ns_timestamp_to_us=True, include_field_ids=False) + + expected = arrow_table_with_all_timestamp_precisions.cast( + arrow_table_schema_with_all_microseconds_timestamp_precisions, safe=False + ).to_batches()[0] + assert result == expected + + +def test__to_requested_schema_timestamps_without_downcast_raises_exception( + arrow_table_schema_with_all_timestamp_precisions: pa.Schema, + arrow_table_with_all_timestamp_precisions: pa.Table, + arrow_table_schema_with_all_microseconds_timestamp_precisions: pa.Schema, + table_schema_with_all_microseconds_timestamp_precision: Schema, +) -> None: + requested_schema = table_schema_with_all_microseconds_timestamp_precision + file_schema = requested_schema + batch = arrow_table_with_all_timestamp_precisions.to_batches()[0] + with pytest.raises(ValueError) as exc_info: + _to_requested_schema(requested_schema, file_schema, batch, downcast_ns_timestamp_to_us=False, include_field_ids=False) + + assert "Unsupported schema projection from timestamp[ns] to timestamp[us]" in str(exc_info.value) diff --git a/tests/io/test_pyarrow_visitor.py b/tests/io/test_pyarrow_visitor.py index d3b6217c7b..897af1bbbd 100644 --- a/tests/io/test_pyarrow_visitor.py +++ b/tests/io/test_pyarrow_visitor.py @@ -161,22 +161,23 @@ def test_pyarrow_time64_ns_to_iceberg() -> None: visit_pyarrow(pyarrow_type, _ConvertToIceberg()) -def test_pyarrow_timestamp_to_iceberg() -> None: - pyarrow_type = pa.timestamp(unit="us") - converted_iceberg_type = visit_pyarrow(pyarrow_type, _ConvertToIceberg()) +@pytest.mark.parametrize("precision", ["s", "ms", "us", "ns"]) +def test_pyarrow_timestamp_to_iceberg(precision: str) -> None: + pyarrow_type 
= pa.timestamp(unit=precision) + converted_iceberg_type = visit_pyarrow(pyarrow_type, _ConvertToIceberg(downcast_ns_timestamp_to_us=True)) assert converted_iceberg_type == TimestampType() - assert visit(converted_iceberg_type, _ConvertToArrowSchema()) == pyarrow_type + # all timestamp types are converted to 'us' precision + assert visit(converted_iceberg_type, _ConvertToArrowSchema()) == pa.timestamp(unit="us") def test_pyarrow_timestamp_invalid_units() -> None: - pyarrow_type = pa.timestamp(unit="ms") - with pytest.raises(TypeError, match=re.escape("Unsupported type: timestamp[ms]")): - visit_pyarrow(pyarrow_type, _ConvertToIceberg()) - pyarrow_type = pa.timestamp(unit="s") - with pytest.raises(TypeError, match=re.escape("Unsupported type: timestamp[s]")): - visit_pyarrow(pyarrow_type, _ConvertToIceberg()) pyarrow_type = pa.timestamp(unit="ns") - with pytest.raises(TypeError, match=re.escape("Unsupported type: timestamp[ns]")): + with pytest.raises( + TypeError, + match=re.escape( + "Iceberg does not yet support 'ns' timestamp precision. Use 'downcast-ns-timestamp-to-us-on-write' configuration property to automatically downcast 'ns' to 'us' on write." + ), + ): visit_pyarrow(pyarrow_type, _ConvertToIceberg()) @@ -192,14 +193,13 @@ def test_pyarrow_timestamp_tz_to_iceberg() -> None: def test_pyarrow_timestamp_tz_invalid_units() -> None: - pyarrow_type = pa.timestamp(unit="ms", tz="UTC") - with pytest.raises(TypeError, match=re.escape("Unsupported type: timestamp[ms, tz=UTC]")): - visit_pyarrow(pyarrow_type, _ConvertToIceberg()) - pyarrow_type = pa.timestamp(unit="s", tz="UTC") - with pytest.raises(TypeError, match=re.escape("Unsupported type: timestamp[s, tz=UTC]")): - visit_pyarrow(pyarrow_type, _ConvertToIceberg()) pyarrow_type = pa.timestamp(unit="ns", tz="UTC") - with pytest.raises(TypeError, match=re.escape("Unsupported type: timestamp[ns, tz=UTC]")): + with pytest.raises( + TypeError, + match=re.escape( + "Iceberg does not yet support 'ns' timestamp precision. Use 'downcast-ns-timestamp-to-us-on-write' configuration property to automatically downcast 'ns' to 'us' on write." 
+ ), + ): visit_pyarrow(pyarrow_type, _ConvertToIceberg()) diff --git a/tests/table/test_init.py b/tests/table/test_init.py index d7c4ffeeaf..7a5ea86d7a 100644 --- a/tests/table/test_init.py +++ b/tests/table/test_init.py @@ -19,7 +19,6 @@ from copy import copy from typing import Any, Dict -import pyarrow as pa import pytest from pydantic import ValidationError from sortedcontainers import SortedList @@ -63,8 +62,6 @@ TableIdentifier, UpdateSchema, _apply_table_update, - _check_schema_compatible, - _determine_partitions, _match_deletes_to_data_file, _TableMetadataUpdateContext, update_table_metadata, @@ -88,7 +85,6 @@ BucketTransform, IdentityTransform, ) -from pyiceberg.typedef import Record from pyiceberg.types import ( BinaryType, BooleanType, @@ -1124,96 +1120,6 @@ def test_correct_schema() -> None: assert "Snapshot not found: -1" in str(exc_info.value) -def test_schema_mismatch_type(table_schema_simple: Schema) -> None: - other_schema = pa.schema(( - pa.field("foo", pa.string(), nullable=True), - pa.field("bar", pa.decimal128(18, 6), nullable=False), - pa.field("baz", pa.bool_(), nullable=True), - )) - - expected = r"""Mismatch in fields: -┏━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ -┃ ┃ Table field ┃ Dataframe field ┃ -┡━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ -│ ✅ │ 1: foo: optional string │ 1: foo: optional string │ -│ ❌ │ 2: bar: required int │ 2: bar: required decimal\(18, 6\) │ -│ ✅ │ 3: baz: optional boolean │ 3: baz: optional boolean │ -└────┴──────────────────────────┴─────────────────────────────────┘ -""" - - with pytest.raises(ValueError, match=expected): - _check_schema_compatible(table_schema_simple, other_schema) - - -def test_schema_mismatch_nullability(table_schema_simple: Schema) -> None: - other_schema = pa.schema(( - pa.field("foo", pa.string(), nullable=True), - pa.field("bar", pa.int32(), nullable=True), - pa.field("baz", pa.bool_(), nullable=True), - )) - - expected = """Mismatch in fields: -┏━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┓ -┃ ┃ Table field ┃ Dataframe field ┃ -┡━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━┩ -│ ✅ │ 1: foo: optional string │ 1: foo: optional string │ -│ ❌ │ 2: bar: required int │ 2: bar: optional int │ -│ ✅ │ 3: baz: optional boolean │ 3: baz: optional boolean │ -└────┴──────────────────────────┴──────────────────────────┘ -""" - - with pytest.raises(ValueError, match=expected): - _check_schema_compatible(table_schema_simple, other_schema) - - -def test_schema_mismatch_missing_field(table_schema_simple: Schema) -> None: - other_schema = pa.schema(( - pa.field("foo", pa.string(), nullable=True), - pa.field("baz", pa.bool_(), nullable=True), - )) - - expected = """Mismatch in fields: -┏━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┓ -┃ ┃ Table field ┃ Dataframe field ┃ -┡━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━┩ -│ ✅ │ 1: foo: optional string │ 1: foo: optional string │ -│ ❌ │ 2: bar: required int │ Missing │ -│ ✅ │ 3: baz: optional boolean │ 3: baz: optional boolean │ -└────┴──────────────────────────┴──────────────────────────┘ -""" - - with pytest.raises(ValueError, match=expected): - _check_schema_compatible(table_schema_simple, other_schema) - - -def test_schema_mismatch_additional_field(table_schema_simple: Schema) -> None: - other_schema = pa.schema(( - pa.field("foo", pa.string(), nullable=True), - pa.field("bar", pa.int32(), nullable=True), - pa.field("baz", pa.bool_(), nullable=True), - pa.field("new_field", 
pa.date32(), nullable=True), - )) - - expected = r"PyArrow table contains more columns: new_field. Update the schema first \(hint, use union_by_name\)." - - with pytest.raises(ValueError, match=expected): - _check_schema_compatible(table_schema_simple, other_schema) - - -def test_schema_downcast(table_schema_simple: Schema) -> None: - # large_string type is compatible with string type - other_schema = pa.schema(( - pa.field("foo", pa.large_string(), nullable=True), - pa.field("bar", pa.int32(), nullable=False), - pa.field("baz", pa.bool_(), nullable=True), - )) - - try: - _check_schema_compatible(table_schema_simple, other_schema) - except Exception: - pytest.fail("Unexpected Exception raised when calling `_check_schema`") - - def test_table_properties(example_table_metadata_v2: Dict[str, Any]) -> None: # metadata properties are all strings for k, v in example_table_metadata_v2["properties"].items(): @@ -1248,85 +1154,3 @@ def test_serialize_commit_table_request() -> None: deserialized_request = CommitTableRequest.model_validate_json(request.model_dump_json()) assert request == deserialized_request - - -def test_partition_for_demo() -> None: - import pyarrow as pa - - test_pa_schema = pa.schema([("year", pa.int64()), ("n_legs", pa.int64()), ("animal", pa.string())]) - test_schema = Schema( - NestedField(field_id=1, name="year", field_type=StringType(), required=False), - NestedField(field_id=2, name="n_legs", field_type=IntegerType(), required=True), - NestedField(field_id=3, name="animal", field_type=StringType(), required=False), - schema_id=1, - ) - test_data = { - "year": [2020, 2022, 2022, 2022, 2021, 2022, 2022, 2019, 2021], - "n_legs": [2, 2, 2, 4, 4, 4, 4, 5, 100], - "animal": ["Flamingo", "Parrot", "Parrot", "Horse", "Dog", "Horse", "Horse", "Brittle stars", "Centipede"], - } - arrow_table = pa.Table.from_pydict(test_data, schema=test_pa_schema) - partition_spec = PartitionSpec( - PartitionField(source_id=2, field_id=1002, transform=IdentityTransform(), name="n_legs_identity"), - PartitionField(source_id=1, field_id=1001, transform=IdentityTransform(), name="year_identity"), - ) - result = _determine_partitions(partition_spec, test_schema, arrow_table) - assert {table_partition.partition_key.partition for table_partition in result} == { - Record(n_legs_identity=2, year_identity=2020), - Record(n_legs_identity=100, year_identity=2021), - Record(n_legs_identity=4, year_identity=2021), - Record(n_legs_identity=4, year_identity=2022), - Record(n_legs_identity=2, year_identity=2022), - Record(n_legs_identity=5, year_identity=2019), - } - assert ( - pa.concat_tables([table_partition.arrow_table_partition for table_partition in result]).num_rows == arrow_table.num_rows - ) - - -def test_identity_partition_on_multi_columns() -> None: - import pyarrow as pa - - test_pa_schema = pa.schema([("born_year", pa.int64()), ("n_legs", pa.int64()), ("animal", pa.string())]) - test_schema = Schema( - NestedField(field_id=1, name="born_year", field_type=StringType(), required=False), - NestedField(field_id=2, name="n_legs", field_type=IntegerType(), required=True), - NestedField(field_id=3, name="animal", field_type=StringType(), required=False), - schema_id=1, - ) - # 5 partitions, 6 unique row values, 12 rows - test_rows = [ - (2021, 4, "Dog"), - (2022, 4, "Horse"), - (2022, 4, "Another Horse"), - (2021, 100, "Centipede"), - (None, 4, "Kirin"), - (2021, None, "Fish"), - ] * 2 - expected = {Record(n_legs_identity=test_rows[i][1], year_identity=test_rows[i][0]) for i in range(len(test_rows))} - 
partition_spec = PartitionSpec( - PartitionField(source_id=2, field_id=1002, transform=IdentityTransform(), name="n_legs_identity"), - PartitionField(source_id=1, field_id=1001, transform=IdentityTransform(), name="year_identity"), - ) - import random - - # there are 12! / ((2!)^6) = 7,484,400 permutations, too many to pick all - for _ in range(1000): - random.shuffle(test_rows) - test_data = { - "born_year": [row[0] for row in test_rows], - "n_legs": [row[1] for row in test_rows], - "animal": [row[2] for row in test_rows], - } - arrow_table = pa.Table.from_pydict(test_data, schema=test_pa_schema) - - result = _determine_partitions(partition_spec, test_schema, arrow_table) - - assert {table_partition.partition_key.partition for table_partition in result} == expected - concatenated_arrow_table = pa.concat_tables([table_partition.arrow_table_partition for table_partition in result]) - assert concatenated_arrow_table.num_rows == arrow_table.num_rows - assert concatenated_arrow_table.sort_by([ - ("born_year", "ascending"), - ("n_legs", "ascending"), - ("animal", "ascending"), - ]) == arrow_table.sort_by([("born_year", "ascending"), ("n_legs", "ascending"), ("animal", "ascending")]) diff --git a/tests/table/test_name_mapping.py b/tests/table/test_name_mapping.py index d4a2bf6c41..3c50a24e5e 100644 --- a/tests/table/test_name_mapping.py +++ b/tests/table/test_name_mapping.py @@ -91,6 +91,23 @@ def test_json_mapped_field_deserialization() -> None: assert MappedField(field_id=1, names=["id", "record_id"]) == MappedField.model_validate_json(mapped_field_with_null_fields) +def test_json_mapped_field_no_names_deserialization() -> None: + mapped_field = """{ + "field-id": 1, + "names": [] + } + """ + assert MappedField(field_id=1, names=[]) == MappedField.model_validate_json(mapped_field) + + mapped_field_with_null_fields = """{ + "field-id": 1, + "names": [], + "fields": null + } + """ + assert MappedField(field_id=1, names=[]) == MappedField.model_validate_json(mapped_field_with_null_fields) + + def test_json_name_mapping_deserialization() -> None: name_mapping = """ [ @@ -247,11 +264,6 @@ def test_mapping_lookup_by_name(table_name_mapping_nested: NameMapping) -> None: table_name_mapping_nested.find("boom") -def test_invalid_mapped_field() -> None: - with pytest.raises(ValueError): - MappedField(field_id=1, names=[]) - - def test_update_mapping_no_updates_or_adds(table_name_mapping_nested: NameMapping) -> None: assert update_mapping(table_name_mapping_nested, {}, {}) == table_name_mapping_nested diff --git a/tests/table/test_snapshots.py b/tests/table/test_snapshots.py index fa3464052a..ff9d92cea3 100644 --- a/tests/table/test_snapshots.py +++ b/tests/table/test_snapshots.py @@ -314,10 +314,6 @@ def test_invalid_operation() -> None: update_snapshot_summaries(summary=Summary(Operation.REPLACE)) assert "Operation not implemented: Operation.REPLACE" in str(e.value) - with pytest.raises(ValueError) as e: - update_snapshot_summaries(summary=Summary(Operation.DELETE)) - assert "Operation not implemented: Operation.DELETE" in str(e.value) - def test_invalid_type() -> None: with pytest.raises(ValueError) as e: diff --git a/tests/test_types.py b/tests/test_types.py index 1e386bb748..52bdce4de8 100644 --- a/tests/test_types.py +++ b/tests/test_types.py @@ -44,6 +44,7 @@ TimeType, UUIDType, strtobool, + transform_dict_value_to_str, ) non_parameterized_types = [ @@ -649,3 +650,14 @@ def test_strtobool() -> None: for val in invalid_values: with pytest.raises(ValueError, match=f"Invalid truth value: {val!r}"): 
strtobool(val) + + +def test_transform_dict_value_to_str() -> None: + input_dict = {"key1": 1, "key2": 2.0, "key3": "3", "key4: ": True, "key5": False} + expected_dict = {"key1": "1", "key2": "2.0", "key3": "3", "key4: ": "true", "key5": "false"} + # valid values + assert transform_dict_value_to_str(input_dict) == expected_dict + # Null value not allowed, should raise ValueError + input_dict["key6"] = None + with pytest.raises(ValueError, match="None type is not a supported value in properties: key6"): + transform_dict_value_to_str(input_dict) diff --git a/tests/utils/test_bin_packing.py b/tests/utils/test_bin_packing.py index 054ea79556..3bfacdf481 100644 --- a/tests/utils/test_bin_packing.py +++ b/tests/utils/test_bin_packing.py @@ -20,7 +20,9 @@ import pytest -from pyiceberg.utils.bin_packing import PackingIterator +from pyiceberg.utils.bin_packing import ListPacker, PackingIterator + +INT_MAX = 2147483647 @pytest.mark.parametrize( @@ -83,4 +85,46 @@ def test_bin_packing_lookback( def weight_func(x: int) -> int: return x + packer: ListPacker[int] = ListPacker(target_weight, lookback, largest_bin_first) + assert list(PackingIterator(splits, target_weight, lookback, weight_func, largest_bin_first)) == expected_lists + assert list(packer.pack(splits, weight_func)) == expected_lists + + +@pytest.mark.parametrize( + "splits, target_weight, lookback, largest_bin_first, expected_lists", + [ + # Single Lookback Tests + ([1, 2, 3, 4, 5], 3, 1, False, [[1, 2], [3], [4], [5]]), + ([1, 2, 3, 4, 5], 4, 1, False, [[1, 2], [3], [4], [5]]), + ([1, 2, 3, 4, 5], 5, 1, False, [[1], [2, 3], [4], [5]]), + ([1, 2, 3, 4, 5], 6, 1, False, [[1, 2, 3], [4], [5]]), + ([1, 2, 3, 4, 5], 7, 1, False, [[1, 2], [3, 4], [5]]), + ([1, 2, 3, 4, 5], 8, 1, False, [[1, 2], [3, 4], [5]]), + ([1, 2, 3, 4, 5], 9, 1, False, [[1, 2, 3], [4, 5]]), + ([1, 2, 3, 4, 5], 11, 1, False, [[1, 2, 3], [4, 5]]), + ([1, 2, 3, 4, 5], 12, 1, False, [[1, 2], [3, 4, 5]]), + ([1, 2, 3, 4, 5], 14, 1, False, [[1], [2, 3, 4, 5]]), + ([1, 2, 3, 4, 5], 15, 1, False, [[1, 2, 3, 4, 5]]), + # Unlimited Lookback Tests + ([1, 2, 3, 4, 5], 3, INT_MAX, False, [[1, 2], [3], [4], [5]]), + ([1, 2, 3, 4, 5], 4, INT_MAX, False, [[2], [1, 3], [4], [5]]), + ([1, 2, 3, 4, 5], 5, INT_MAX, False, [[2, 3], [1, 4], [5]]), + ([1, 2, 3, 4, 5], 6, INT_MAX, False, [[3], [2, 4], [1, 5]]), + ([1, 2, 3, 4, 5], 7, INT_MAX, False, [[1], [3, 4], [2, 5]]), + ([1, 2, 3, 4, 5], 8, INT_MAX, False, [[1, 2, 4], [3, 5]]), + ([1, 2, 3, 4, 5], 9, INT_MAX, False, [[1, 2, 3], [4, 5]]), + ([1, 2, 3, 4, 5], 10, INT_MAX, False, [[2, 3], [1, 4, 5]]), + ([1, 2, 3, 4, 5], 11, INT_MAX, False, [[1, 3], [2, 4, 5]]), + ([1, 2, 3, 4, 5], 12, INT_MAX, False, [[1, 2], [3, 4, 5]]), + ([1, 2, 3, 4, 5], 13, INT_MAX, False, [[2], [1, 3, 4, 5]]), + ([1, 2, 3, 4, 5], 14, INT_MAX, False, [[1], [2, 3, 4, 5]]), + ([1, 2, 3, 4, 5], 15, INT_MAX, False, [[1, 2, 3, 4, 5]]), + ], +) +def test_reverse_bin_packing_lookback( + splits: List[int], target_weight: int, lookback: int, largest_bin_first: bool, expected_lists: List[List[int]] +) -> None: + packer: ListPacker[int] = ListPacker(target_weight, lookback, largest_bin_first) + result = packer.pack_end(splits, lambda x: x) + assert result == expected_lists diff --git a/tests/utils/test_manifest.py b/tests/utils/test_manifest.py index 711a697856..82750fe871 100644 --- a/tests/utils/test_manifest.py +++ b/tests/utils/test_manifest.py @@ -37,7 +37,7 @@ write_manifest, write_manifest_list, ) -from pyiceberg.partitioning import PartitionField, PartitionSpec +from 
pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionField, PartitionSpec from pyiceberg.schema import Schema from pyiceberg.table.snapshots import Operation, Snapshot, Summary from pyiceberg.transforms import IdentityTransform @@ -68,7 +68,7 @@ def test_read_manifest_entry(generated_manifest_entry_file: str) -> None: assert manifest_entry.status == ManifestEntryStatus.ADDED assert manifest_entry.snapshot_id == 8744736658442914487 - assert manifest_entry.data_sequence_number == 0 + assert manifest_entry.sequence_number == 0 assert isinstance(manifest_entry.data_file, DataFile) data_file = manifest_entry.data_file @@ -252,7 +252,7 @@ def test_read_manifest_v1(generated_manifest_file_file_v1: str) -> None: entry = entries[0] - assert entry.data_sequence_number == 0 + assert entry.sequence_number == 0 assert entry.file_sequence_number == 0 assert entry.snapshot_id == 8744736658442914487 assert entry.status == ManifestEntryStatus.ADDED @@ -302,12 +302,29 @@ def test_read_manifest_v2(generated_manifest_file_file_v2: str) -> None: entry = entries[0] - assert entry.data_sequence_number == 3 + assert entry.sequence_number == 3 assert entry.file_sequence_number == 3 assert entry.snapshot_id == 8744736658442914487 assert entry.status == ManifestEntryStatus.ADDED +def test_write_empty_manifest() -> None: + io = load_file_io() + test_schema = Schema(NestedField(1, "foo", IntegerType(), False)) + with TemporaryDirectory() as tmpdir: + tmp_avro_file = tmpdir + "/test_write_manifest.avro" + + with pytest.raises(ValueError, match="An empty manifest file has been written"): + with write_manifest( + format_version=1, + spec=UNPARTITIONED_PARTITION_SPEC, + schema=test_schema, + output_file=io.new_output(tmp_avro_file), + snapshot_id=8744736658442914487, + ) as _: + pass + + @pytest.mark.parametrize("format_version", [1, 2]) def test_write_manifest( generated_manifest_file_file_v1: str, generated_manifest_file_file_v2: str, format_version: TableVersion @@ -364,7 +381,7 @@ def test_write_manifest( assert manifest_entry.status == ManifestEntryStatus.ADDED assert manifest_entry.snapshot_id == 8744736658442914487 - assert manifest_entry.data_sequence_number == -1 if format_version == 1 else 3 + assert manifest_entry.sequence_number == -1 if format_version == 1 else 3 assert isinstance(manifest_entry.data_file, DataFile) data_file = manifest_entry.data_file @@ -610,7 +627,7 @@ def test_write_manifest_list( entry = entries[0] - assert entry.data_sequence_number == 0 if format_version == 1 else 3 + assert entry.sequence_number == 0 if format_version == 1 else 3 assert entry.file_sequence_number == 0 if format_version == 1 else 3 assert entry.snapshot_id == 8744736658442914487 assert entry.status == ManifestEntryStatus.ADDED
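
Note (not part of the patch): the tests above exercise `_check_pyarrow_schema_compatible`, which this change relocates into `pyiceberg.io.pyarrow`, so the write path can accept a `pa.Table` whose schema is a subset of, or ordered differently from, the Iceberg table schema. The standalone sketch below mirrors `test_schema_projection` and `test_schema_mismatch_additional_field`; it assumes a PyIceberg build that includes this patch, and since the helper is private its import path and two-argument call follow the diff rather than a documented public API.

import pyarrow as pa

from pyiceberg.io.pyarrow import _check_pyarrow_schema_compatible
from pyiceberg.schema import Schema
from pyiceberg.types import BooleanType, IntegerType, NestedField, StringType

# Iceberg schema equivalent to the `table_schema_simple` fixture used in the tests above.
table_schema = Schema(
    NestedField(field_id=1, name="foo", field_type=StringType(), required=False),
    NestedField(field_id=2, name="bar", field_type=IntegerType(), required=True),
    NestedField(field_id=3, name="baz", field_type=BooleanType(), required=False),
)

# A PyArrow schema that omits the optional "baz" column is accepted,
# mirroring test_schema_projection above.
subset_schema = pa.schema((
    pa.field("foo", pa.string(), nullable=True),
    pa.field("bar", pa.int32(), nullable=False),
))
_check_pyarrow_schema_compatible(table_schema, subset_schema)  # no exception raised

# An extra column the table does not know about is rejected,
# mirroring test_schema_mismatch_additional_field above.
extra_column_schema = subset_schema.append(pa.field("new_field", pa.date32(), nullable=True))
try:
    _check_pyarrow_schema_compatible(table_schema, extra_column_schema)
except ValueError as err:
    print(err)  # "PyArrow table contains more columns: new_field. Update the schema first ..."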