From d77d6c9ca60d5ec55c4af77870555373f4aed828 Mon Sep 17 00:00:00 2001 From: Seb Pretzer <24555985+sebpretzer@users.noreply.github.com> Date: Mon, 18 Mar 2024 12:36:36 -0400 Subject: [PATCH 1/6] adding test for glue endpoint override --- pyiceberg/catalog/glue.py | 6 +++++- tests/catalog/test_glue.py | 16 ++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/pyiceberg/catalog/glue.py b/pyiceberg/catalog/glue.py index adec150d84..6d87a68b62 100644 --- a/pyiceberg/catalog/glue.py +++ b/pyiceberg/catalog/glue.py @@ -105,6 +105,10 @@ GLUE_SKIP_ARCHIVE = "glue.skip-archive" GLUE_SKIP_ARCHIVE_DEFAULT = True +# Configure an alternative endpoint of the Glue service for GlueCatalog to access. +# This could be used to use GlueCatalog with any glue-compatible metastore service that has a different endpoint +GLUE_CATALOG_ENDPOINT = "glue.endpoint" + ICEBERG_FIELD_ID = "iceberg.field.id" ICEBERG_FIELD_OPTIONAL = "iceberg.field.optional" ICEBERG_FIELD_CURRENT = "iceberg.field.current" @@ -285,7 +289,7 @@ def __init__(self, name: str, **properties: Any): aws_secret_access_key=properties.get("aws_secret_access_key"), aws_session_token=properties.get("aws_session_token"), ) - self.glue: GlueClient = session.client("glue") + self.glue: GlueClient = session.client("glue", endpoint_url=properties.get(GLUE_CATALOG_ENDPOINT)) if glue_catalog_id := properties.get(GLUE_ID): _register_glue_catalog_id_with_glue_client(self.glue, glue_catalog_id) diff --git a/tests/catalog/test_glue.py b/tests/catalog/test_glue.py index 6d44d92724..10815b63ff 100644 --- a/tests/catalog/test_glue.py +++ b/tests/catalog/test_glue.py @@ -692,3 +692,19 @@ def test_commit_table_properties( updated_table_metadata = table.metadata assert test_catalog._parse_metadata_version(table.metadata_location) == 1 assert updated_table_metadata.properties == {"test_a": "test_aa", "test_c": "test_c"} + + +def test_glue_endpoint_override(moto_endpoint_url: str, database_name: str) -> None: + catalog_name = "glue" + test_catalog = GlueCatalog( + catalog_name, **{"s3.endpoint": moto_endpoint_url, "warehouse": f"s3://{BUCKET_NAME}", "glue.endpoint": moto_endpoint_url} + ) + assert test_catalog.glue.meta.endpoint_url == moto_endpoint_url + + test_catalog.create_namespace(namespace=database_name) + assert (database_name,) in test_catalog.list_namespaces() + + with mock_aws(): + other_catalog = GlueCatalog(catalog_name, **{"s3.endpoint": moto_endpoint_url, "warehouse": f"s3://{BUCKET_NAME}"}) + assert other_catalog.glue.meta.endpoint_url != moto_endpoint_url + assert (database_name,) not in other_catalog.list_namespaces() From 19e970c3a1fe9d8392f8bfd90cdcf39f70711b3f Mon Sep 17 00:00:00 2001 From: Seb Pretzer <24555985+sebpretzer@users.noreply.github.com> Date: Mon, 18 Mar 2024 13:05:32 -0400 Subject: [PATCH 2/6] removing unnecessary code --- tests/catalog/test_glue.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tests/catalog/test_glue.py b/tests/catalog/test_glue.py index 10815b63ff..1c12066945 100644 --- a/tests/catalog/test_glue.py +++ b/tests/catalog/test_glue.py @@ -700,11 +700,3 @@ def test_glue_endpoint_override(moto_endpoint_url: str, database_name: str) -> N catalog_name, **{"s3.endpoint": moto_endpoint_url, "warehouse": f"s3://{BUCKET_NAME}", "glue.endpoint": moto_endpoint_url} ) assert test_catalog.glue.meta.endpoint_url == moto_endpoint_url - - test_catalog.create_namespace(namespace=database_name) - assert (database_name,) in test_catalog.list_namespaces() - - with mock_aws(): - other_catalog = GlueCatalog(catalog_name, **{"s3.endpoint": moto_endpoint_url, "warehouse": f"s3://{BUCKET_NAME}"}) - assert other_catalog.glue.meta.endpoint_url != moto_endpoint_url - assert (database_name,) not in other_catalog.list_namespaces() From b9c65838693ba8c8b724f0bfa5133ee35ddaa958 Mon Sep 17 00:00:00 2001 From: HonahX Date: Thu, 11 Jul 2024 23:20:30 -0700 Subject: [PATCH 3/6] add integration test config --- tests/catalog/integration_test_glue.py | 8 +++++--- tests/conftest.py | 5 +++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/tests/catalog/integration_test_glue.py b/tests/catalog/integration_test_glue.py index c69bc86ca8..a5293e38f2 100644 --- a/tests/catalog/integration_test_glue.py +++ b/tests/catalog/integration_test_glue.py @@ -25,7 +25,7 @@ from botocore.exceptions import ClientError from pyiceberg.catalog import Catalog, MetastoreCatalog -from pyiceberg.catalog.glue import GlueCatalog +from pyiceberg.catalog.glue import GLUE_CATALOG_ENDPOINT, GlueCatalog from pyiceberg.exceptions import ( NamespaceAlreadyExistsError, NamespaceNotEmptyError, @@ -36,7 +36,7 @@ from pyiceberg.io.pyarrow import _dataframe_to_data_files, schema_to_pyarrow from pyiceberg.schema import Schema from pyiceberg.types import IntegerType -from tests.conftest import clean_up, get_bucket_name, get_s3_path +from tests.conftest import clean_up, get_bucket_name, get_glue_endpoint, get_s3_path # The number of tables/databases used in list_table/namespace test LIST_TEST_NUMBER = 2 @@ -51,7 +51,9 @@ def fixture_glue_client() -> boto3.client: @pytest.fixture(name="test_catalog", scope="module") def fixture_test_catalog() -> Generator[Catalog, None, None]: """Configure the pre- and post-setting of aws integration test.""" - test_catalog = GlueCatalog(CATALOG_NAME, warehouse=get_s3_path(get_bucket_name())) + test_catalog = GlueCatalog( + CATALOG_NAME, **{"warehouse": get_s3_path(get_bucket_name()), GLUE_CATALOG_ENDPOINT: get_glue_endpoint()} + ) yield test_catalog clean_up(test_catalog) diff --git a/tests/conftest.py b/tests/conftest.py index 95e1128af6..c1d7ad007c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2043,6 +2043,11 @@ def get_bucket_name() -> str: return bucket_name +def get_glue_endpoint() -> Optional[str]: + """Set the optional environment variable AWS_TEST_GLUE_ENDPOINT for a glue endpoint to test.""" + return os.getenv("AWS_TEST_GLUE_ENDPOINT") + + def get_s3_path(bucket_name: str, database_name: Optional[str] = None, table_name: Optional[str] = None) -> str: result_path = f"s3://{bucket_name}" if database_name is not None: From 3aec52beec8442b0312f104cae425e14dc416c3f Mon Sep 17 00:00:00 2001 From: HonahX Date: Thu, 11 Jul 2024 23:48:59 -0700 Subject: [PATCH 4/6] add doc --- mkdocs/docs/configuration.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/mkdocs/docs/configuration.md b/mkdocs/docs/configuration.md index 5346e82c25..76e1816c3a 100644 --- a/mkdocs/docs/configuration.md +++ b/mkdocs/docs/configuration.md @@ -288,6 +288,16 @@ catalog: region_name: ``` + + +| Key | Example | Description | +| ----------------- | ------------------------------------ | ------------------------------------------------------------------------------- | +| glue.id | 111111111111 | Configure the 12-digit ID of the Glue Catalog | +| glue.skip-archive | true | Configure whether to skip the archival of older table versions. Default to true | +| glue.endpoint | https://glue.us-east-1.amazonaws.com | Configure an alternative endpoint of the Glue service for GlueCatalog to access | + + + ## DynamoDB Catalog If you want to use AWS DynamoDB as the catalog, you can use the last two ways to configure the pyiceberg and refer From fb3f116c933f7a95630ab2d8faa4bebe0468137a Mon Sep 17 00:00:00 2001 From: HonahX Date: Fri, 12 Jul 2024 00:56:56 -0700 Subject: [PATCH 5/6] add missing mock_aws --- tests/catalog/test_glue.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/catalog/test_glue.py b/tests/catalog/test_glue.py index 739ed63dbb..35044c4c21 100644 --- a/tests/catalog/test_glue.py +++ b/tests/catalog/test_glue.py @@ -864,6 +864,7 @@ def test_register_table_with_given_location( assert test_catalog.table_exists(identifier) is True +@mock_aws def test_glue_endpoint_override(moto_endpoint_url: str, database_name: str) -> None: catalog_name = "glue" test_catalog = GlueCatalog( From b4feb335b99a70cd833b7518b4b584468fdb479b Mon Sep 17 00:00:00 2001 From: HonahX Date: Fri, 12 Jul 2024 12:43:35 -0700 Subject: [PATCH 6/6] make test happy --- tests/catalog/test_glue.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/catalog/test_glue.py b/tests/catalog/test_glue.py index 35044c4c21..c4afa50c52 100644 --- a/tests/catalog/test_glue.py +++ b/tests/catalog/test_glue.py @@ -865,9 +865,10 @@ def test_register_table_with_given_location( @mock_aws -def test_glue_endpoint_override(moto_endpoint_url: str, database_name: str) -> None: +def test_glue_endpoint_override(_bucket_initialize: None, moto_endpoint_url: str, database_name: str) -> None: catalog_name = "glue" + test_endpoint = "https://test-endpoint" test_catalog = GlueCatalog( - catalog_name, **{"s3.endpoint": moto_endpoint_url, "warehouse": f"s3://{BUCKET_NAME}", "glue.endpoint": moto_endpoint_url} + catalog_name, **{"s3.endpoint": moto_endpoint_url, "warehouse": f"s3://{BUCKET_NAME}", "glue.endpoint": test_endpoint} ) - assert test_catalog.glue.meta.endpoint_url == moto_endpoint_url + assert test_catalog.glue.meta.endpoint_url == test_endpoint