Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Glue endpoint config variable, continue #530 #920

Merged
merged 7 commits into from
Jul 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions mkdocs/docs/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,16 @@ catalog:
region_name: <REGION_NAME>
```

<!-- markdown-link-check-disable -->

| Key | Example | Description |
| ----------------- | ------------------------------------ | ------------------------------------------------------------------------------- |
| glue.id | 111111111111 | Configure the 12-digit ID of the Glue Catalog |
| glue.skip-archive | true | Configure whether to skip the archival of older table versions. Defaults to true |
| glue.endpoint | https://glue.us-east-1.amazonaws.com | Configure an alternative endpoint of the Glue service for GlueCatalog to access |

<!-- markdown-link-check-enable-->

## DynamoDB Catalog

If you want to use AWS DynamoDB as the catalog, you can use the last two ways to configure the pyiceberg and refer
Expand Down
6 changes: 5 additions & 1 deletion pyiceberg/catalog/glue.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,10 @@
GLUE_SKIP_ARCHIVE = "glue.skip-archive"
GLUE_SKIP_ARCHIVE_DEFAULT = True

# Configure an alternative endpoint of the Glue service for GlueCatalog to access.
# This allows GlueCatalog to be used with any Glue-compatible metastore service that has a different endpoint.
GLUE_CATALOG_ENDPOINT = "glue.endpoint"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.


ICEBERG_FIELD_ID = "iceberg.field.id"
ICEBERG_FIELD_OPTIONAL = "iceberg.field.optional"
ICEBERG_FIELD_CURRENT = "iceberg.field.current"
Expand Down Expand Up @@ -289,7 +293,7 @@ def __init__(self, name: str, **properties: Any):
aws_secret_access_key=properties.get("aws_secret_access_key"),
aws_session_token=properties.get("aws_session_token"),
)
self.glue: GlueClient = session.client("glue")
self.glue: GlueClient = session.client("glue", endpoint_url=properties.get(GLUE_CATALOG_ENDPOINT))

if glue_catalog_id := properties.get(GLUE_ID):
_register_glue_catalog_id_with_glue_client(self.glue, glue_catalog_id)
Expand Down
8 changes: 5 additions & 3 deletions tests/catalog/integration_test_glue.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from botocore.exceptions import ClientError

from pyiceberg.catalog import Catalog, MetastoreCatalog
from pyiceberg.catalog.glue import GlueCatalog
from pyiceberg.catalog.glue import GLUE_CATALOG_ENDPOINT, GlueCatalog
from pyiceberg.exceptions import (
NamespaceAlreadyExistsError,
NamespaceNotEmptyError,
Expand All @@ -36,7 +36,7 @@
from pyiceberg.io.pyarrow import _dataframe_to_data_files, schema_to_pyarrow
from pyiceberg.schema import Schema
from pyiceberg.types import IntegerType
from tests.conftest import clean_up, get_bucket_name, get_s3_path
from tests.conftest import clean_up, get_bucket_name, get_glue_endpoint, get_s3_path

# The number of tables/databases used in list_table/namespace test
LIST_TEST_NUMBER = 2
Expand All @@ -51,7 +51,9 @@ def fixture_glue_client() -> boto3.client:
@pytest.fixture(name="test_catalog", scope="module")
def fixture_test_catalog() -> Generator[Catalog, None, None]:
"""Configure the pre- and post-setting of aws integration test."""
test_catalog = GlueCatalog(CATALOG_NAME, warehouse=get_s3_path(get_bucket_name()))
test_catalog = GlueCatalog(
CATALOG_NAME, **{"warehouse": get_s3_path(get_bucket_name()), GLUE_CATALOG_ENDPOINT: get_glue_endpoint()}
)
yield test_catalog
clean_up(test_catalog)

Expand Down
10 changes: 10 additions & 0 deletions tests/catalog/test_glue.py
Original file line number Diff line number Diff line change
Expand Up @@ -862,3 +862,13 @@ def test_register_table_with_given_location(
table = test_catalog.register_table(identifier, location)
assert table.identifier == (catalog_name,) + identifier
assert test_catalog.table_exists(identifier) is True


@mock_aws
def test_glue_endpoint_override(_bucket_initialize: None, moto_endpoint_url: str, database_name: str) -> None:
    """Check that a ``glue.endpoint`` catalog property ends up on the catalog's Glue client."""
    expected_endpoint = "https://test-endpoint"
    catalog_properties = {
        "s3.endpoint": moto_endpoint_url,
        "warehouse": f"s3://{BUCKET_NAME}",
        "glue.endpoint": expected_endpoint,
    }
    catalog = GlueCatalog("glue", **catalog_properties)
    # Compare against the endpoint URL reported by the catalog's Glue client.
    assert catalog.glue.meta.endpoint_url == expected_endpoint
5 changes: 5 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -2043,6 +2043,11 @@ def get_bucket_name() -> str:
return bucket_name


def get_glue_endpoint() -> Optional[str]:
    """Return the Glue endpoint to test against, read from ``AWS_TEST_GLUE_ENDPOINT``.

    The environment variable is optional; ``None`` is returned when it is not set.
    """
    return os.environ.get("AWS_TEST_GLUE_ENDPOINT")


def get_s3_path(bucket_name: str, database_name: Optional[str] = None, table_name: Optional[str] = None) -> str:
result_path = f"s3://{bucket_name}"
if database_name is not None:
Expand Down