From c7f59d38909ebfb989edba9337901ed739411eb7 Mon Sep 17 00:00:00 2001 From: joelmata Date: Mon, 1 Jul 2024 14:21:07 +0200 Subject: [PATCH] Formatting with Black --- .../src/datahub/ingestion/source/abs/source.py | 14 +++----------- .../src/datahub/ingestion/source/azure/abs_util.py | 12 ++---------- 2 files changed, 5 insertions(+), 21 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/abs/source.py b/metadata-ingestion/src/datahub/ingestion/source/abs/source.py index 9168033f5b601..60b3bea0ba746 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/abs/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/abs/source.py @@ -540,15 +540,6 @@ def abs_browser( matches = re.finditer(r"{\s*\w+\s*}", path_spec.include, re.MULTILINE) matches_list = list(matches) if matches_list and path_spec.sample_files: - # TODO refactor for abs - # Replace the patch_spec include's templates with star because later we want to resolve all the stars - # to actual directories. - # For example: - # "s3://my-test-bucket/*/{dept}/*/{table}/*/*.*" -> "s3://my-test-bucket/*/*/*/{table}/*/*.*" - # We only keep the last template as a marker to know the point util we need to resolve path. - # After the marker we can safely get sample files for sampling because it is not used in the - # table name, so we don't need all the files. - # This speed up processing but we won't be able to get a precise modification date/size/number of files. max_start: int = -1 include: str = path_spec.include max_match: str = "" @@ -580,8 +571,9 @@ def abs_browser( ) logger.info(f"Getting files from folder: {dir_to_process}") dir_to_process = dir_to_process.rstrip("\\") - for obj in ( - container_client.list_blobs(name_starts_with=f"{dir_to_process}", results_per_page=PAGE_SIZE) + for obj in container_client.list_blobs( + name_starts_with=f"{dir_to_process}", + results_per_page=PAGE_SIZE, ): abs_path = self.create_abs_path(obj.name) logger.debug(f"Sampling file: {abs_path}") diff --git a/metadata-ingestion/src/datahub/ingestion/source/azure/abs_util.py b/metadata-ingestion/src/datahub/ingestion/source/azure/abs_util.py index 3c661c2e509e9..180e1b51dfa2e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/azure/abs_util.py +++ b/metadata-ingestion/src/datahub/ingestion/source/azure/abs_util.py @@ -11,15 +11,6 @@ from datahub.ingestion.source.azure.azure_common import AzureConnectionConfig from datahub.metadata.schema_classes import GlobalTagsClass, TagAssociationClass -# TODO -# Azure blob storage URIs: -# https://..core.windows.net/containername/ -# where type is in [blob, web, dfs, file, queue, table] -# unknown: -# - what types are supported by the DSH -# - what types are supported by Datahub -# https://learn.microsoft.com/en-us/azure/storage/common/storage-account-overview#types-of-storage-accounts - ABS_PREFIXES_REGEX = re.compile( r"(http[s]?://[a-z0-9]{3,24}\.blob\.core\.windows\.net/)" ) @@ -180,6 +171,7 @@ def create_properties( prefix=f"{prefix}_{key}", custom_properties=custom_properties, resource_name=resource_name, + json_properties=json_properties ) else: custom_properties = add_property( @@ -259,7 +251,7 @@ def list_folders( this_dict = {} for blob in blob_list: - blob_name = blob.name[:blob.name.rfind("/")+1] + blob_name = blob.name[: blob.name.rfind("/") + 1] folder_structure_arr = blob_name.split("/") folder_name = ""