Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DRAFT] Dataset factory parsing rules demo #2559

Closed
wants to merge 3 commits into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 63 additions & 0 deletions kedro/framework/cli/catalog.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
"""A collection of CLI commands for working with Kedro catalog."""
import copy
from collections import defaultdict

import click
import yaml
from click import secho
from parse import parse

from kedro.framework.cli.utils import KedroCliError, env_option, split_string
from kedro.framework.project import pipelines, settings
Expand Down Expand Up @@ -174,3 +176,64 @@ def _add_missing_datasets_to_catalog(missing_ds, catalog_path):
catalog_path.parent.mkdir(exist_ok=True)
with catalog_path.open(mode="w") as catalog_file:
yaml.safe_dump(catalog_config, catalog_file, default_flow_style=False)


def pick_best_match(matches):
matches = sorted(matches, key=lambda x: (specificity(x[0]), -x[0].count("{"), x[0]))
return matches[0]


def specificity(pattern):
Copy link
Member

@merelcht merelcht May 10, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this method just counting the number of characters that are not in brackets? Wouldn't it be easier to just remove the text in {} and then get the length of the string?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That is what is happening here but using parse instead of a regex.

"""This function will check length of exactly matched characters not inside brackets
Example -
specificity("{namespace}.companies") = 10
specificity("{namespace}.{dataset}") = 1
specificity("france.companies") = 16
"""
pattern_variables = parse(pattern, pattern).named
for k in pattern_variables:
pattern_variables[k] = ""
specific_characters = pattern.format(**pattern_variables)
return -len(specific_characters)


@catalog.command("resolve")
@env_option
@click.pass_obj
def resolve_catalog_datasets(metadata: ProjectMetadata, env):
session = _create_session(metadata.package_name, env=env)
context = session.load_context()
catalog_conf = context.config_loader["catalog"]

# Create a list of all datasets used in the project pipelines.
pipeline_datasets = []
for _, pl_obj in pipelines.items():
pipeline_ds = pl_obj.data_sets()
for dataset in pipeline_ds:
pipeline_datasets.append(dataset)
pipeline_datasets = set(pipeline_datasets)
result_catalog = {}
for pipeline_dataset in pipeline_datasets:
matches = []
for ds_name in catalog_conf.keys():
result = parse(ds_name, pipeline_dataset)
if not result:
continue
# We have found a match!
matches.append((ds_name, result))
if len(matches) == 0:
# print(f"skipping {pipeline_dataset} -> maybe params or MemoryDataSet")
continue
best_match, result = pick_best_match(matches)
best_match_config = copy.deepcopy(catalog_conf[best_match])
# Match results to patterns in best matching catalog entry
for key, value in best_match_config.items():
string_value = str(value)
try:
formatted_string = string_value.format_map(result.named)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should only happen for dataset fields with {} otherwise you get weird things like:

shuttles:
  filepath: data/01_raw/shuttles.xlsx
  load_args: data/01_raw/shuttles.xlsx
  type: pandas.ExcelDataSet

But that's not really part of the matching logic, so not super important here.

Copy link
Contributor Author

@ankatiyar ankatiyar May 12, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This can be changed to only match with patterns containing {} but this logic will still work for dataset entries that are not patterns(exact matches) because of how parse works -
parse("xyz", "abc") returns None (not a match at all - ignore)
parse("xyz", "xyz") returns <Result () {}> (exact match, not a pattern - result is not empty so still added to matches list)
parse("hello {name}", "hello world") returns <Result () {'name': 'world'}> (potential match- added to matches list)

So the exact match then gets chosen as the best match and since it wouldn't contain any brackets anywhere in the catalog entry, it would not be changed at all.

except KeyError:
# Dataset config has a placeholder which is not present in the ds name
print(f"'{key}' has invalid catalog configuration")
best_match_config[key] = formatted_string
result_catalog[pipeline_dataset] = best_match_config
secho(yaml.dump(result_catalog))