Skip to content

Commit

Permalink
feat(ingest/glue): allow ingestion of empty databases from Glue (#10666)
Browse files: browse the repository at this point in the history
Co-authored-by: Harshal Sheth <[email protected]>
  • Loading branch information
skrydal and hsheth2 authored Jul 3, 2024
1 parent 226b059 commit 099021c
Show file tree
Hide file tree
Showing 5 changed files with 132 additions and 4 deletions.
7 changes: 3 additions & 4 deletions metadata-ingestion/src/datahub/ingestion/source/aws/glue.py
Original file line number Diff line number Diff line change
Expand Up @@ -949,9 +949,11 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
]

def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
database_seen = set()
databases, tables = self.get_all_databases_and_tables()

for database in databases.values():
yield from self.gen_database_containers(database)

for table in tables:
database_name = table["DatabaseName"]
table_name = table["Name"]
Expand All @@ -962,9 +964,6 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
) or not self.source_config.table_pattern.allowed(full_table_name):
self.report.report_table_dropped(full_table_name)
continue
if database_name not in database_seen:
database_seen.add(database_name)
yield from self.gen_database_containers(databases[database_name])

dataset_urn = make_dataset_urn_with_platform_instance(
platform=self.platform,
Expand Down
56 changes: 56 additions & 0 deletions metadata-ingestion/tests/unit/glue/glue_mces_golden.json
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,59 @@
}
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:110bc08849d1c1bde5fc345dab5c3ae7",
"changeType": "UPSERT",
"aspectName": "containerProperties",
"aspect": {
"json": {
"customProperties": {
"platform": "glue",
"env": "PROD",
"database": "empty-database",
"CreateTime": "June 1, 2021 at 14:55:13"
},
"name": "empty-database",
"qualifiedName": "arn:aws:glue:us-west-2:123412341234:database/empty-database"
}
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:110bc08849d1c1bde5fc345dab5c3ae7",
"changeType": "UPSERT",
"aspectName": "status",
"aspect": {
"json": {
"removed": false
}
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:110bc08849d1c1bde5fc345dab5c3ae7",
"changeType": "UPSERT",
"aspectName": "dataPlatformInstance",
"aspect": {
"json": {
"platform": "urn:li:dataPlatform:glue"
}
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:110bc08849d1c1bde5fc345dab5c3ae7",
"changeType": "UPSERT",
"aspectName": "subTypes",
"aspect": {
"json": {
"typeNames": [
"Database"
]
}
}
},
{
"proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
Expand Down Expand Up @@ -236,6 +289,7 @@
"type": "DATAOWNER"
}
],
"ownerTypes": {},
"lastModified": {
"time": 0,
"actor": "urn:li:corpuser:unknown"
Expand Down Expand Up @@ -473,6 +527,7 @@
"type": "DATAOWNER"
}
],
"ownerTypes": {},
"lastModified": {
"time": 0,
"actor": "urn:li:corpuser:unknown"
Expand Down Expand Up @@ -658,6 +713,7 @@
"type": "DATAOWNER"
}
],
"ownerTypes": {},
"lastModified": {
"time": 0,
"actor": "urn:li:corpuser:unknown"
Expand Down
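55 changes: 55 additions & 0 deletions metadata-ingestion/tests/unit/glue/glue_mces_platform_instance_golden.json
Original file line number Diff line number Diff line change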
Expand Up @@ -57,6 +57,61 @@
}
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:ac4381240e82d55400c22e4392e744a4",
"changeType": "UPSERT",
"aspectName": "containerProperties",
"aspect": {
"json": {
"customProperties": {
"platform": "glue",
"instance": "some_instance_name",
"env": "PROD",
"database": "empty-database",
"CreateTime": "June 1, 2021 at 14:55:13"
},
"name": "empty-database",
"qualifiedName": "arn:aws:glue:us-west-2:123412341234:database/empty-database"
}
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:ac4381240e82d55400c22e4392e744a4",
"changeType": "UPSERT",
"aspectName": "status",
"aspect": {
"json": {
"removed": false
}
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:ac4381240e82d55400c22e4392e744a4",
"changeType": "UPSERT",
"aspectName": "dataPlatformInstance",
"aspect": {
"json": {
"platform": "urn:li:dataPlatform:glue",
"instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:glue,some_instance_name)"
}
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:ac4381240e82d55400c22e4392e744a4",
"changeType": "UPSERT",
"aspectName": "subTypes",
"aspect": {
"json": {
"typeNames": [
"Database"
]
}
}
},
{
"proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
Expand Down
5 changes: 5 additions & 0 deletions metadata-ingestion/tests/unit/test_glue_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,11 @@ def test_glue_ingest(
get_tables_response_2,
{"DatabaseName": "test-database"},
)
glue_stubber.add_response(
"get_tables",
{"TableList": []},
{"DatabaseName": "empty-database"},
)
glue_stubber.add_response("get_jobs", get_jobs_response, {})
glue_stubber.add_response(
"get_dataflow_graph",
Expand Down
13 changes: 13 additions & 0 deletions metadata-ingestion/tests/unit/test_glue_source_stubs.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,19 @@
],
"CatalogId": "123412341234",
},
{
"Name": "empty-database",
"CreateTime": datetime.datetime(2021, 6, 1, 14, 55, 13),
"CreateTableDefaultPermissions": [
{
"Principal": {
"DataLakePrincipalIdentifier": "IAM_ALLOWED_PRINCIPALS"
},
"Permissions": ["ALL"],
}
],
"CatalogId": "123412341234",
},
]
}
databases_1 = {
Expand Down

0 comments on commit 099021c

Please sign in to comment.