# Code added to kedro/framework/cli/catalog.py by the patch series
# "Add catalog resolve fn" / "Add specificity fn" / "remove print statement".
# It relies on that module's top-level imports, including the ``copy`` and
# ``parse.parse`` imports introduced by the first patch of the series.


def pick_best_match(matches):
    """Return the best ``(pattern, parse_result)`` tuple from ``matches``.

    Candidates are ranked by, in order:
      1. most literal (non-placeholder) characters, see ``specificity``;
      2. most ``{`` placeholders;
      3. alphabetical order of the pattern.

    Raises:
        IndexError: if ``matches`` is empty.
    """
    ranked = sorted(
        matches,
        key=lambda match: (specificity(match[0]), -match[0].count("{"), match[0]),
    )
    return ranked[0]


def specificity(pattern):
    """Return the negated count of literal characters in ``pattern``.

    Only characters outside ``{...}`` placeholders are counted. The result is
    negated so that sorting in ascending order puts the most specific pattern
    first.

    Example -
        specificity("{namespace}.companies") = -10
        specificity("{namespace}.{dataset}") = -1
        specificity("france.companies") = -16
    """
    # Local import: ``string`` is not among the imports the patch adds.
    from string import Formatter

    # Formatter.parse yields (literal_text, field_name, format_spec, conversion)
    # tuples; summing the literal parts avoids parsing the pattern against
    # itself, which fails for typed or positional fields.
    literal_length = sum(
        len(literal_text) for literal_text, _, _, _ in Formatter().parse(pattern)
    )
    return -literal_length


def _format_config_value(value, named):
    """Substitute ``named`` placeholders in every string found inside ``value``.

    Dicts and lists are walked recursively; any other non-string value is
    returned unchanged so that booleans, numbers, etc. keep their type.
    """
    if isinstance(value, str):
        return value.format_map(named)
    if isinstance(value, dict):
        return {k: _format_config_value(v, named) for k, v in value.items()}
    if isinstance(value, list):
        return [_format_config_value(item, named) for item in value]
    return value


@catalog.command("resolve")
@env_option
@click.pass_obj
def resolve_catalog_datasets(metadata: ProjectMetadata, env):
    """Print the catalog entries resolved for every pipeline dataset as YAML.

    Each dataset name used by the registered pipelines is parsed against every
    catalog key. The best matching key is chosen with ``pick_best_match`` and
    the placeholders in its configuration are filled with the values captured
    from the dataset name. Datasets that match no catalog key are skipped.
    """
    session = _create_session(metadata.package_name, env=env)
    context = session.load_context()
    catalog_conf = context.config_loader["catalog"]

    # Collect the names of all datasets used in the project pipelines.
    pipeline_datasets = set()
    for pl_obj in pipelines.values():
        pipeline_datasets.update(pl_obj.data_sets())

    result_catalog = {}
    # Sorted so that any warnings are emitted in a stable order.
    for pipeline_dataset in sorted(pipeline_datasets):
        matches = []
        for ds_name in catalog_conf:
            result = parse(ds_name, pipeline_dataset)
            if result:
                matches.append((ds_name, result))
        if not matches:
            # No catalog entry or pattern matches this dataset name.
            continue

        best_match, result = pick_best_match(matches)
        best_match_config = copy.deepcopy(catalog_conf[best_match])
        # Fill the placeholders of the best matching catalog entry.
        for key, value in best_match_config.items():
            try:
                best_match_config[key] = _format_config_value(value, result.named)
            except (KeyError, ValueError):
                # The config has a placeholder that is not present in the
                # dataset name, or is not a valid format string. Keep the
                # original value and warn on stderr so stdout stays valid YAML.
                secho(
                    f"'{key}' has invalid catalog configuration",
                    fg="yellow",
                    err=True,
                )
        result_catalog[pipeline_dataset] = best_match_config
    secho(yaml.dump(result_catalog))